-rw-r--r--  gnu/llvm/llvm/lib/Target/ARM/ARMISelLowering.cpp | 7518
1 file changed, 1506 insertions, 6012 deletions
diff --git a/gnu/llvm/llvm/lib/Target/ARM/ARMISelLowering.cpp b/gnu/llvm/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 2e78b52d099..66f3f418d06 100644
--- a/gnu/llvm/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/gnu/llvm/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21,7 +21,6 @@
 #include "ARMRegisterInfo.h"
 #include "ARMSelectionDAGInfo.h"
 #include "ARMSubtarget.h"
-#include "ARMTargetTransformInfo.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "Utils/ARMBaseInfo.h"
@@ -55,7 +54,6 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -110,7 +108,6 @@
 #include <cstdlib>
 #include <iterator>
 #include <limits>
-#include <optional>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -146,7 +143,7 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
     cl::desc("Maximum size of ALL constants to promote into a constant pool"),
     cl::init(128));

-cl::opt<unsigned>
+static cl::opt<unsigned>
 MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
   cl::desc("Maximum interleave factor for MVE VLDn to generate."),
   cl::init(2));
@@ -156,7 +153,8 @@ static const MCPhysReg GPRArgRegs[] = {
   ARM::R0, ARM::R1, ARM::R2, ARM::R3
 };

-void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
+                                       MVT PromotedBitwiseVT) {
   if (VT != PromotedLdStVT) {
     setOperationAction(ISD::LOAD, VT, Promote);
     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
@@ -195,6 +193,16 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
     setOperationAction(ISD::SRL, VT, Custom);
   }

+  // Promote all bit-wise operations.
+  if (VT.isInteger() && VT != PromotedBitwiseVT) {
+    setOperationAction(ISD::AND, VT, Promote);
+    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
+    setOperationAction(ISD::OR, VT, Promote);
+    AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
+    setOperationAction(ISD::XOR, VT, Promote);
+    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
+  }
+
   // Neon does not support vector divide/remainder operations.
setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); @@ -202,8 +210,6 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) { setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); - setOperationAction(ISD::UDIVREM, VT, Expand); if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) @@ -216,12 +222,12 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) { void ARMTargetLowering::addDRTypeForNEON(MVT VT) { addRegisterClass(VT, &ARM::DPRRegClass); - addTypeForNEON(VT, MVT::f64); + addTypeForNEON(VT, MVT::f64, MVT::v2i32); } void ARMTargetLowering::addQRTypeForNEON(MVT VT) { addRegisterClass(VT, &ARM::DPairRegClass); - addTypeForNEON(VT, MVT::v2f64); + addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } void ARMTargetLowering::setAllExpand(MVT VT) { @@ -272,23 +278,13 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); - setOperationAction(ISD::ABDS, VT, Legal); - setOperationAction(ISD::ABDU, VT, Legal); - setOperationAction(ISD::AVGFLOORS, VT, Legal); - setOperationAction(ISD::AVGFLOORU, VT, Legal); - setOperationAction(ISD::AVGCEILS, VT, Legal); - setOperationAction(ISD::AVGCEILU, VT, Legal); // No native support for these. setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::UDIVREM, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); // Vector reductions setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); @@ -296,19 +292,12 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); - setOperationAction(ISD::VECREDUCE_MUL, VT, Custom); - setOperationAction(ISD::VECREDUCE_AND, VT, Custom); - setOperationAction(ISD::VECREDUCE_OR, VT, Custom); - setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); - } else { - setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); } // Pre and Post inc are supported on loads and stores @@ -338,8 +327,6 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Legal); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); // Pre and Post inc are supported on loads and stores for (unsigned im = (unsigned)ISD::PRE_INC; @@ -354,10 +341,6 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::FMINNUM, VT, Legal); setOperationAction(ISD::FMAXNUM, VT, Legal); setOperationAction(ISD::FROUND, VT, Legal); - setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); - setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); - 
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); // No native support for these. setOperationAction(ISD::FDIV, VT, Expand); @@ -375,17 +358,6 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { } } - // Custom Expand smaller than legal vector reductions to prevent false zero - // items being added. - setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom); - setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom); - setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom); - setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom); - setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom); - setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom); - setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom); - setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom); - // We 'support' these types up to bitcast/load/store level, regardless of // MVE integer-only / float support. Only doing FP data processing on the FP // vector types is inhibited at integer-only level. @@ -396,11 +368,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); - // We can do bitwise operations on v2i64 vectors setOperationAction(ISD::AND, MVT::v2i64, Legal); setOperationAction(ISD::OR, MVT::v2i64, Legal); @@ -435,7 +403,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { } // Predicate types - const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1}; + const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; for (auto VT : pTypes) { addRegisterClass(VT, &ARM::VCCRRegClass); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -448,36 +416,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); - setOperationAction(ISD::TRUNCATE, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - - if (!HasMVEFP) { - setOperationAction(ISD::SINT_TO_FP, VT, Expand); - setOperationAction(ISD::UINT_TO_FP, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT, Expand); - setOperationAction(ISD::FP_TO_UINT, VT, Expand); - } } - setOperationAction(ISD::SETCC, MVT::v2i1, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand); - setOperationAction(ISD::AND, MVT::v2i1, Expand); - setOperationAction(ISD::OR, MVT::v2i1, Expand); - setOperationAction(ISD::XOR, MVT::v2i1, Expand); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Expand); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Expand); - - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); - 
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); } ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, @@ -490,7 +429,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && - !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) { + !Subtarget->isTargetWatchOS()) { bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID), @@ -572,9 +511,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); - setLibcallName(RTLIB::MUL_I128, nullptr); - setLibcallName(RTLIB::MULO_I64, nullptr); - setLibcallName(RTLIB::MULO_I128, nullptr); // RTLIB if (Subtarget->isAAPCS_ABI() && @@ -772,12 +708,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, Subtarget->hasFPRegs()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); addRegisterClass(MVT::f64, &ARM::DPRRegClass); - - setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); - if (!Subtarget->hasVFP2Base()) setAllExpand(MVT::f32); if (!Subtarget->hasFP64()) @@ -787,26 +717,22 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasFullFP16()) { addRegisterClass(MVT::f16, &ARM::HPRRegClass); setOperationAction(ISD::BITCAST, MVT::i16, Custom); + setOperationAction(ISD::BITCAST, MVT::i32, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); setOperationAction(ISD::FMINNUM, MVT::f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); } - if (Subtarget->hasBF16()) { - addRegisterClass(MVT::bf16, &ARM::HPRRegClass); - setAllExpand(MVT::bf16); - if (!Subtarget->hasFullFP16()) - setOperationAction(ISD::BITCAST, MVT::bf16, Custom); - } - for (MVT VT : MVT::fixedlen_vector_valuetypes()) { for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); addAllExtLoads(VT, InnerVT, Expand); } + setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); @@ -823,7 +749,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Combine low-overhead loop intrinsics so that we can lower i1 types. 
if (Subtarget->hasLOB()) { - setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC}); + setTargetDAGCombine(ISD::BRCOND); + setTargetDAGCombine(ISD::BR_CC); } if (Subtarget->hasNEON()) { @@ -844,11 +771,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8f16); addDRTypeForNEON(MVT::v4f16); } - - if (Subtarget->hasBF16()) { - addQRTypeForNEON(MVT::v8bf16); - addDRTypeForNEON(MVT::v4bf16); - } } if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { @@ -984,19 +906,22 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); - for (MVT VT : MVT::fixedlen_vector_valuetypes()) { - setOperationAction(ISD::MULHS, VT, Expand); - setOperationAction(ISD::MULHU, VT, Expand); - } - // NEON only has FMA instructions as of VFP4. if (!Subtarget->hasVFP4Base()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); setOperationAction(ISD::FMA, MVT::v4f32, Expand); } - setTargetDAGCombine({ISD::SHL, ISD::SRL, ISD::SRA, ISD::FP_TO_SINT, - ISD::FP_TO_UINT, ISD::FDIV, ISD::LOAD}); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::FP_TO_UINT); + setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::LOAD); // It is legal to extload from v4i8 to v4i16 or v4i32. for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, @@ -1010,20 +935,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { - setTargetDAGCombine( - {ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE, ISD::INSERT_SUBVECTOR, - ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, - ISD::SIGN_EXTEND_INREG, ISD::STORE, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, - ISD::ANY_EXTEND, ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN, - ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST}); - } - if (Subtarget->hasMVEIntegerOps()) { - setTargetDAGCombine({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX, - ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC, - ISD::SETCC}); - } - if (Subtarget->hasMVEFloatOps()) { - setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); } if (!Subtarget->hasFP64()) { @@ -1131,10 +1049,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); setOperationAction(ISD::SADDSAT, MVT::i16, Custom); setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); - setOperationAction(ISD::UADDSAT, MVT::i8, Custom); - setOperationAction(ISD::USUBSAT, MVT::i8, Custom); - setOperationAction(ISD::UADDSAT, MVT::i16, Custom); - setOperationAction(ISD::USUBSAT, MVT::i16, Custom); } if (Subtarget->hasBaseDSP()) { setOperationAction(ISD::SADDSAT, MVT::i32, Legal); @@ -1159,8 +1073,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRA, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); - setOperationAction(ISD::LOAD, 
MVT::i64, Custom); - setOperationAction(ISD::STORE, MVT::i64, Custom); // MVE lowers 64 bit shifts to lsll and lsrl // assuming that ISD::SRL and SRA of i64 are already marked custom @@ -1357,32 +1269,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } - // Compute supported atomic widths. - if (Subtarget->isTargetLinux() || - (!Subtarget->isMClass() && Subtarget->hasV6Ops())) { - // For targets where __sync_* routines are reliably available, we use them - // if necessary. - // - // ARM Linux always supports 64-bit atomics through kernel-assisted atomic - // routines (kernel 3.1 or later). FIXME: Not with compiler-rt? - // - // ARMv6 targets have native instructions in ARM mode. For Thumb mode, - // such targets should provide __sync_* routines, which use the ARM mode - // instructions. (ARMv6 doesn't have dmb, but it has an equivalent - // encoding; see ARMISD::MEMBARRIER_MCR.) - setMaxAtomicSizeInBitsSupported(64); - } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) || - Subtarget->hasForced32BitAtomics()) { - // Cortex-M (besides Cortex-M0) have 32-bit atomics. - setMaxAtomicSizeInBitsSupported(32); - } else { - // We can't assume anything about other targets; just use libatomic - // routines. - setMaxAtomicSizeInBitsSupported(0); - } - - setMaxDivRemBitWidthSupported(64); - setOperationAction(ISD::PREFETCH, MVT::Other, Custom); // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. @@ -1397,8 +1283,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. setOperationAction(ISD::BITCAST, MVT::i64, Custom); - setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); - setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); + setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); } // We want to custom lower some of our intrinsics. @@ -1534,16 +1419,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } if (Subtarget->hasNEON()) { - // vmin and vmax aren't available in a scalar form, so we can use - // a NEON instruction with an undef lane instead. This has a performance - // penalty on some cores, so we don't do this unless we have been - // asked to by the core tuning model. - if (Subtarget->useNEONForSinglePrecisionFP()) { - setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); - setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); - } + // vmin and vmax aren't available in a scalar form, so we use + // a NEON instruction with an undef lane instead. 
+ setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); + setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); @@ -1564,21 +1445,17 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine - setTargetDAGCombine( - {ISD::ADD, ISD::SUB, ISD::MUL, ISD::AND, ISD::OR, ISD::XOR}); - - if (Subtarget->hasMVEIntegerOps()) - setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::XOR); if (Subtarget->hasV6Ops()) setTargetDAGCombine(ISD::SRL); if (Subtarget->isThumb1Only()) setTargetDAGCombine(ISD::SHL); - // Attempt to lower smin/smax to ssat/usat - if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || - Subtarget->isThumb2()) { - setTargetDAGCombine({ISD::SMIN, ISD::SMAX}); - } setStackPointerRegisterToSaveRestore(ARM::SP); @@ -1664,216 +1541,170 @@ ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, } const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { -#define MAKE_CASE(V) \ - case V: \ - return #V; switch ((ARMISD::NodeType)Opcode) { - case ARMISD::FIRST_NUMBER: - break; - MAKE_CASE(ARMISD::Wrapper) - MAKE_CASE(ARMISD::WrapperPIC) - MAKE_CASE(ARMISD::WrapperJT) - MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL) - MAKE_CASE(ARMISD::CALL) - MAKE_CASE(ARMISD::CALL_PRED) - MAKE_CASE(ARMISD::CALL_NOLINK) - MAKE_CASE(ARMISD::tSECALL) - MAKE_CASE(ARMISD::t2CALL_BTI) - MAKE_CASE(ARMISD::BRCOND) - MAKE_CASE(ARMISD::BR_JT) - MAKE_CASE(ARMISD::BR2_JT) - MAKE_CASE(ARMISD::RET_FLAG) - MAKE_CASE(ARMISD::SERET_FLAG) - MAKE_CASE(ARMISD::INTRET_FLAG) - MAKE_CASE(ARMISD::PIC_ADD) - MAKE_CASE(ARMISD::CMP) - MAKE_CASE(ARMISD::CMN) - MAKE_CASE(ARMISD::CMPZ) - MAKE_CASE(ARMISD::CMPFP) - MAKE_CASE(ARMISD::CMPFPE) - MAKE_CASE(ARMISD::CMPFPw0) - MAKE_CASE(ARMISD::CMPFPEw0) - MAKE_CASE(ARMISD::BCC_i64) - MAKE_CASE(ARMISD::FMSTAT) - MAKE_CASE(ARMISD::CMOV) - MAKE_CASE(ARMISD::SUBS) - MAKE_CASE(ARMISD::SSAT) - MAKE_CASE(ARMISD::USAT) - MAKE_CASE(ARMISD::ASRL) - MAKE_CASE(ARMISD::LSRL) - MAKE_CASE(ARMISD::LSLL) - MAKE_CASE(ARMISD::SRL_FLAG) - MAKE_CASE(ARMISD::SRA_FLAG) - MAKE_CASE(ARMISD::RRX) - MAKE_CASE(ARMISD::ADDC) - MAKE_CASE(ARMISD::ADDE) - MAKE_CASE(ARMISD::SUBC) - MAKE_CASE(ARMISD::SUBE) - MAKE_CASE(ARMISD::LSLS) - MAKE_CASE(ARMISD::VMOVRRD) - MAKE_CASE(ARMISD::VMOVDRR) - MAKE_CASE(ARMISD::VMOVhr) - MAKE_CASE(ARMISD::VMOVrh) - MAKE_CASE(ARMISD::VMOVSR) - MAKE_CASE(ARMISD::EH_SJLJ_SETJMP) - MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP) - MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH) - MAKE_CASE(ARMISD::TC_RETURN) - MAKE_CASE(ARMISD::THREAD_POINTER) - MAKE_CASE(ARMISD::DYN_ALLOC) - MAKE_CASE(ARMISD::MEMBARRIER_MCR) - MAKE_CASE(ARMISD::PRELOAD) - MAKE_CASE(ARMISD::LDRD) - MAKE_CASE(ARMISD::STRD) - MAKE_CASE(ARMISD::WIN__CHKSTK) - MAKE_CASE(ARMISD::WIN__DBZCHK) - MAKE_CASE(ARMISD::PREDICATE_CAST) - MAKE_CASE(ARMISD::VECTOR_REG_CAST) - MAKE_CASE(ARMISD::MVESEXT) - MAKE_CASE(ARMISD::MVEZEXT) - MAKE_CASE(ARMISD::MVETRUNC) - MAKE_CASE(ARMISD::VCMP) - MAKE_CASE(ARMISD::VCMPZ) - MAKE_CASE(ARMISD::VTST) - MAKE_CASE(ARMISD::VSHLs) - 
MAKE_CASE(ARMISD::VSHLu) - MAKE_CASE(ARMISD::VSHLIMM) - MAKE_CASE(ARMISD::VSHRsIMM) - MAKE_CASE(ARMISD::VSHRuIMM) - MAKE_CASE(ARMISD::VRSHRsIMM) - MAKE_CASE(ARMISD::VRSHRuIMM) - MAKE_CASE(ARMISD::VRSHRNIMM) - MAKE_CASE(ARMISD::VQSHLsIMM) - MAKE_CASE(ARMISD::VQSHLuIMM) - MAKE_CASE(ARMISD::VQSHLsuIMM) - MAKE_CASE(ARMISD::VQSHRNsIMM) - MAKE_CASE(ARMISD::VQSHRNuIMM) - MAKE_CASE(ARMISD::VQSHRNsuIMM) - MAKE_CASE(ARMISD::VQRSHRNsIMM) - MAKE_CASE(ARMISD::VQRSHRNuIMM) - MAKE_CASE(ARMISD::VQRSHRNsuIMM) - MAKE_CASE(ARMISD::VSLIIMM) - MAKE_CASE(ARMISD::VSRIIMM) - MAKE_CASE(ARMISD::VGETLANEu) - MAKE_CASE(ARMISD::VGETLANEs) - MAKE_CASE(ARMISD::VMOVIMM) - MAKE_CASE(ARMISD::VMVNIMM) - MAKE_CASE(ARMISD::VMOVFPIMM) - MAKE_CASE(ARMISD::VDUP) - MAKE_CASE(ARMISD::VDUPLANE) - MAKE_CASE(ARMISD::VEXT) - MAKE_CASE(ARMISD::VREV64) - MAKE_CASE(ARMISD::VREV32) - MAKE_CASE(ARMISD::VREV16) - MAKE_CASE(ARMISD::VZIP) - MAKE_CASE(ARMISD::VUZP) - MAKE_CASE(ARMISD::VTRN) - MAKE_CASE(ARMISD::VTBL1) - MAKE_CASE(ARMISD::VTBL2) - MAKE_CASE(ARMISD::VMOVN) - MAKE_CASE(ARMISD::VQMOVNs) - MAKE_CASE(ARMISD::VQMOVNu) - MAKE_CASE(ARMISD::VCVTN) - MAKE_CASE(ARMISD::VCVTL) - MAKE_CASE(ARMISD::VIDUP) - MAKE_CASE(ARMISD::VMULLs) - MAKE_CASE(ARMISD::VMULLu) - MAKE_CASE(ARMISD::VQDMULH) - MAKE_CASE(ARMISD::VADDVs) - MAKE_CASE(ARMISD::VADDVu) - MAKE_CASE(ARMISD::VADDVps) - MAKE_CASE(ARMISD::VADDVpu) - MAKE_CASE(ARMISD::VADDLVs) - MAKE_CASE(ARMISD::VADDLVu) - MAKE_CASE(ARMISD::VADDLVAs) - MAKE_CASE(ARMISD::VADDLVAu) - MAKE_CASE(ARMISD::VADDLVps) - MAKE_CASE(ARMISD::VADDLVpu) - MAKE_CASE(ARMISD::VADDLVAps) - MAKE_CASE(ARMISD::VADDLVApu) - MAKE_CASE(ARMISD::VMLAVs) - MAKE_CASE(ARMISD::VMLAVu) - MAKE_CASE(ARMISD::VMLAVps) - MAKE_CASE(ARMISD::VMLAVpu) - MAKE_CASE(ARMISD::VMLALVs) - MAKE_CASE(ARMISD::VMLALVu) - MAKE_CASE(ARMISD::VMLALVps) - MAKE_CASE(ARMISD::VMLALVpu) - MAKE_CASE(ARMISD::VMLALVAs) - MAKE_CASE(ARMISD::VMLALVAu) - MAKE_CASE(ARMISD::VMLALVAps) - MAKE_CASE(ARMISD::VMLALVApu) - MAKE_CASE(ARMISD::VMINVu) - MAKE_CASE(ARMISD::VMINVs) - MAKE_CASE(ARMISD::VMAXVu) - MAKE_CASE(ARMISD::VMAXVs) - MAKE_CASE(ARMISD::UMAAL) - MAKE_CASE(ARMISD::UMLAL) - MAKE_CASE(ARMISD::SMLAL) - MAKE_CASE(ARMISD::SMLALBB) - MAKE_CASE(ARMISD::SMLALBT) - MAKE_CASE(ARMISD::SMLALTB) - MAKE_CASE(ARMISD::SMLALTT) - MAKE_CASE(ARMISD::SMULWB) - MAKE_CASE(ARMISD::SMULWT) - MAKE_CASE(ARMISD::SMLALD) - MAKE_CASE(ARMISD::SMLALDX) - MAKE_CASE(ARMISD::SMLSLD) - MAKE_CASE(ARMISD::SMLSLDX) - MAKE_CASE(ARMISD::SMMLAR) - MAKE_CASE(ARMISD::SMMLSR) - MAKE_CASE(ARMISD::QADD16b) - MAKE_CASE(ARMISD::QSUB16b) - MAKE_CASE(ARMISD::QADD8b) - MAKE_CASE(ARMISD::QSUB8b) - MAKE_CASE(ARMISD::UQADD16b) - MAKE_CASE(ARMISD::UQSUB16b) - MAKE_CASE(ARMISD::UQADD8b) - MAKE_CASE(ARMISD::UQSUB8b) - MAKE_CASE(ARMISD::BUILD_VECTOR) - MAKE_CASE(ARMISD::BFI) - MAKE_CASE(ARMISD::VORRIMM) - MAKE_CASE(ARMISD::VBICIMM) - MAKE_CASE(ARMISD::VBSP) - MAKE_CASE(ARMISD::MEMCPY) - MAKE_CASE(ARMISD::VLD1DUP) - MAKE_CASE(ARMISD::VLD2DUP) - MAKE_CASE(ARMISD::VLD3DUP) - MAKE_CASE(ARMISD::VLD4DUP) - MAKE_CASE(ARMISD::VLD1_UPD) - MAKE_CASE(ARMISD::VLD2_UPD) - MAKE_CASE(ARMISD::VLD3_UPD) - MAKE_CASE(ARMISD::VLD4_UPD) - MAKE_CASE(ARMISD::VLD1x2_UPD) - MAKE_CASE(ARMISD::VLD1x3_UPD) - MAKE_CASE(ARMISD::VLD1x4_UPD) - MAKE_CASE(ARMISD::VLD2LN_UPD) - MAKE_CASE(ARMISD::VLD3LN_UPD) - MAKE_CASE(ARMISD::VLD4LN_UPD) - MAKE_CASE(ARMISD::VLD1DUP_UPD) - MAKE_CASE(ARMISD::VLD2DUP_UPD) - MAKE_CASE(ARMISD::VLD3DUP_UPD) - MAKE_CASE(ARMISD::VLD4DUP_UPD) - MAKE_CASE(ARMISD::VST1_UPD) - MAKE_CASE(ARMISD::VST2_UPD) - MAKE_CASE(ARMISD::VST3_UPD) - 
MAKE_CASE(ARMISD::VST4_UPD) - MAKE_CASE(ARMISD::VST1x2_UPD) - MAKE_CASE(ARMISD::VST1x3_UPD) - MAKE_CASE(ARMISD::VST1x4_UPD) - MAKE_CASE(ARMISD::VST2LN_UPD) - MAKE_CASE(ARMISD::VST3LN_UPD) - MAKE_CASE(ARMISD::VST4LN_UPD) - MAKE_CASE(ARMISD::WLS) - MAKE_CASE(ARMISD::WLSSETUP) - MAKE_CASE(ARMISD::LE) - MAKE_CASE(ARMISD::LOOP_DEC) - MAKE_CASE(ARMISD::CSINV) - MAKE_CASE(ARMISD::CSNEG) - MAKE_CASE(ARMISD::CSINC) - MAKE_CASE(ARMISD::MEMCPYLOOP) - MAKE_CASE(ARMISD::MEMSETLOOP) -#undef MAKE_CASE + case ARMISD::FIRST_NUMBER: break; + case ARMISD::Wrapper: return "ARMISD::Wrapper"; + case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; + case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; + case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL"; + case ARMISD::CALL: return "ARMISD::CALL"; + case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; + case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; + case ARMISD::BRCOND: return "ARMISD::BRCOND"; + case ARMISD::BR_JT: return "ARMISD::BR_JT"; + case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; + case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; + case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; + case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; + case ARMISD::CMP: return "ARMISD::CMP"; + case ARMISD::CMN: return "ARMISD::CMN"; + case ARMISD::CMPZ: return "ARMISD::CMPZ"; + case ARMISD::CMPFP: return "ARMISD::CMPFP"; + case ARMISD::CMPFPE: return "ARMISD::CMPFPE"; + case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; + case ARMISD::CMPFPEw0: return "ARMISD::CMPFPEw0"; + case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; + case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; + + case ARMISD::CMOV: return "ARMISD::CMOV"; + case ARMISD::SUBS: return "ARMISD::SUBS"; + + case ARMISD::SSAT: return "ARMISD::SSAT"; + case ARMISD::USAT: return "ARMISD::USAT"; + + case ARMISD::ASRL: return "ARMISD::ASRL"; + case ARMISD::LSRL: return "ARMISD::LSRL"; + case ARMISD::LSLL: return "ARMISD::LSLL"; + + case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; + case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; + case ARMISD::RRX: return "ARMISD::RRX"; + + case ARMISD::ADDC: return "ARMISD::ADDC"; + case ARMISD::ADDE: return "ARMISD::ADDE"; + case ARMISD::SUBC: return "ARMISD::SUBC"; + case ARMISD::SUBE: return "ARMISD::SUBE"; + case ARMISD::LSLS: return "ARMISD::LSLS"; + + case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; + case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; + case ARMISD::VMOVhr: return "ARMISD::VMOVhr"; + case ARMISD::VMOVrh: return "ARMISD::VMOVrh"; + case ARMISD::VMOVSR: return "ARMISD::VMOVSR"; + + case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; + case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; + + case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; + + case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; + + case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; + + case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; + + case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; + + case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; + case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; + + case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST"; + case ARMISD::VCMP: return "ARMISD::VCMP"; + case ARMISD::VCMPZ: return "ARMISD::VCMPZ"; + case ARMISD::VTST: return "ARMISD::VTST"; + + case ARMISD::VSHLs: return "ARMISD::VSHLs"; + case ARMISD::VSHLu: return "ARMISD::VSHLu"; + case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM"; + case 
ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM"; + case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM"; + case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM"; + case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM"; + case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM"; + case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM"; + case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM"; + case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM"; + case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM"; + case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM"; + case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM"; + case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM"; + case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM"; + case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM"; + case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM"; + case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM"; + case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; + case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; + case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; + case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; + case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; + case ARMISD::VDUP: return "ARMISD::VDUP"; + case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; + case ARMISD::VEXT: return "ARMISD::VEXT"; + case ARMISD::VREV64: return "ARMISD::VREV64"; + case ARMISD::VREV32: return "ARMISD::VREV32"; + case ARMISD::VREV16: return "ARMISD::VREV16"; + case ARMISD::VZIP: return "ARMISD::VZIP"; + case ARMISD::VUZP: return "ARMISD::VUZP"; + case ARMISD::VTRN: return "ARMISD::VTRN"; + case ARMISD::VTBL1: return "ARMISD::VTBL1"; + case ARMISD::VTBL2: return "ARMISD::VTBL2"; + case ARMISD::VMOVN: return "ARMISD::VMOVN"; + case ARMISD::VMULLs: return "ARMISD::VMULLs"; + case ARMISD::VMULLu: return "ARMISD::VMULLu"; + case ARMISD::UMAAL: return "ARMISD::UMAAL"; + case ARMISD::UMLAL: return "ARMISD::UMLAL"; + case ARMISD::SMLAL: return "ARMISD::SMLAL"; + case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; + case ARMISD::SMLALBT: return "ARMISD::SMLALBT"; + case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; + case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; + case ARMISD::SMULWB: return "ARMISD::SMULWB"; + case ARMISD::SMULWT: return "ARMISD::SMULWT"; + case ARMISD::SMLALD: return "ARMISD::SMLALD"; + case ARMISD::SMLALDX: return "ARMISD::SMLALDX"; + case ARMISD::SMLSLD: return "ARMISD::SMLSLD"; + case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; + case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; + case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; + case ARMISD::QADD16b: return "ARMISD::QADD16b"; + case ARMISD::QSUB16b: return "ARMISD::QSUB16b"; + case ARMISD::QADD8b: return "ARMISD::QADD8b"; + case ARMISD::QSUB8b: return "ARMISD::QSUB8b"; + case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; + case ARMISD::BFI: return "ARMISD::BFI"; + case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; + case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; + case ARMISD::VBSL: return "ARMISD::VBSL"; + case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; + case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; + case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; + case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; + case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; + case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; + case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; + case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; + case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; + case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; + case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; + case ARMISD::VLD4LN_UPD: return 
"ARMISD::VLD4LN_UPD"; + case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD"; + case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; + case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; + case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; + case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; + case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; + case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; + case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; + case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; + case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; + case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; + case ARMISD::WLS: return "ARMISD::WLS"; + case ARMISD::LE: return "ARMISD::LE"; + case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC"; + case ARMISD::CSINV: return "ARMISD::CSINV"; + case ARMISD::CSNEG: return "ARMISD::CSNEG"; + case ARMISD::CSINC: return "ARMISD::CSINC"; } return nullptr; } @@ -1884,11 +1715,8 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, return getPointerTy(DL); // MVE has a predicate register. - if ((Subtarget->hasMVEIntegerOps() && - (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || - VT == MVT::v16i8)) || - (Subtarget->hasMVEFloatOps() && - (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16))) + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } @@ -1902,18 +1730,12 @@ ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive // MVE Q registers. - if (Subtarget->hasNEON()) { + if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { if (VT == MVT::v4i64) return &ARM::QQPRRegClass; if (VT == MVT::v8i64) return &ARM::QQQQPRRegClass; } - if (Subtarget->hasMVEIntegerOps()) { - if (VT == MVT::v4i64) - return &ARM::MQQPRRegClass; - if (VT == MVT::v8i64) - return &ARM::MQQQQPRRegClass; - } return TargetLowering::getRegClassFor(VT); } @@ -1921,14 +1743,13 @@ ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { // source/dest is aligned and the copy size is large enough. We therefore want // to align such objects passed to memory intrinsics. bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, - Align &PrefAlign) const { + unsigned &PrefAlign) const { if (!isa<MemIntrinsic>(CI)) return false; MinSize = 8; // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 // cycle faster than 4-byte aligned LDM. - PrefAlign = - (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4)); + PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); return true; } @@ -2075,10 +1896,8 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, return CallingConv::PreserveMost; case CallingConv::ARM_AAPCS_VFP: case CallingConv::Swift: - case CallingConv::SwiftTail: return isVarArg ? 
CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; case CallingConv::C: - case CallingConv::Tail: if (!Subtarget->isAAPCS_ABI()) return CallingConv::ARM_APCS; else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && @@ -2136,35 +1955,6 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, } } -SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, - MVT LocVT, MVT ValVT, SDValue Val) const { - Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()), - Val); - if (Subtarget->hasFullFP16()) { - Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val); - } else { - Val = DAG.getNode(ISD::TRUNCATE, dl, - MVT::getIntegerVT(ValVT.getSizeInBits()), Val); - Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val); - } - return Val; -} - -SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, - MVT LocVT, MVT ValVT, - SDValue Val) const { - if (Subtarget->hasFullFP16()) { - Val = DAG.getNode(ARMISD::VMOVrh, dl, - MVT::getIntegerVT(LocVT.getSizeInBits()), Val); - } else { - Val = DAG.getNode(ISD::BITCAST, dl, - MVT::getIntegerVT(ValVT.getSizeInBits()), Val); - Val = DAG.getNode(ISD::ZERO_EXTEND, dl, - MVT::getIntegerVT(LocVT.getSizeInBits()), Val); - } - return DAG.getNode(ISD::BITCAST, dl, LocVT, Val); -} - /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue ARMTargetLowering::LowerCallResult( @@ -2192,8 +1982,7 @@ SDValue ARMTargetLowering::LowerCallResult( } SDValue Val; - if (VA.needsCustom() && - (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) { + if (VA.needsCustom()) { // Handle f64 or half of a v2f64. SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); @@ -2242,44 +2031,25 @@ SDValue ARMTargetLowering::LowerCallResult( break; } - // f16 arguments have their size extended to 4 bytes and passed as if they - // had been copied to the LSBs of a 32-bit register. - // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) - if (VA.needsCustom() && - (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) - Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val); - InVals.push_back(Val); } return Chain; } -std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg( - const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr, - bool IsTailCall, int SPDiff) const { - SDValue DstAddr; - MachinePointerInfo DstInfo; - int32_t Offset = VA.getLocMemOffset(); - MachineFunction &MF = DAG.getMachineFunction(); - - if (IsTailCall) { - Offset += SPDiff; - auto PtrVT = getPointerTy(DAG.getDataLayout()); - int Size = VA.getLocVT().getFixedSizeInBits() / 8; - int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); - DstAddr = DAG.getFrameIndex(FI, PtrVT); - DstInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); - } else { - SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); - DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - StackPtr, PtrOff); - DstInfo = - MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset); - } - - return std::make_pair(DstAddr, DstInfo); +/// LowerMemOpCallTo - Store the argument to the stack. 
+SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, + SDValue Arg, const SDLoc &dl, + SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); } void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, @@ -2288,8 +2058,7 @@ void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVectorImpl<SDValue> &MemOpChains, - bool IsTailCall, - int SPDiff) const { + ISD::ArgFlagsTy Flags) const { SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); unsigned id = Subtarget->isLittle() ? 0 : 1; @@ -2303,20 +2072,12 @@ void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); - SDValue DstAddr; - MachinePointerInfo DstInfo; - std::tie(DstAddr, DstInfo) = - computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff); - MemOpChains.push_back( - DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo)); + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), + dl, DAG, NextVA, + Flags)); } } -static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { - return (CC == CallingConv::Fast && GuaranteeTailCalls) || - CC == CallingConv::Tail || CC == CallingConv::SwiftTail; -} - /// LowerCall - Lowering a call into a callseq_start <- /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter /// nodes. @@ -2336,41 +2097,22 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); MachineFunction::CallSiteInfo CSInfo; bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; - bool isCmseNSCall = false; - bool isSibCall = false; bool PreferIndirect = false; - bool GuardWithBTI = false; - - // Lower 'returns_twice' calls to a pseudo-instruction. - if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) && - !Subtarget->noBTIAtReturnTwice()) - GuardWithBTI = AFI->branchTargetEnforcement(); - - // Determine whether this is a non-secure function call. - if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call")) - isCmseNSCall = true; // Disable tail calls if they're not supported. if (!Subtarget->supportsTailCall()) isTailCall = false; - // For both the non-secure calls and the returns from a CMSE entry function, - // the function needs to do some extra work afte r the call, or before the - // return, respectively, thus it cannot end with atail call - if (isCmseNSCall || AFI->isCmseNSEntryFunction()) - isTailCall = false; - if (isa<GlobalAddressSDNode>(Callee)) { // If we're optimizing for minimum size and the function is called three or // more times in this block, we can improve codesize by calling indirectly // as BLXr has a 16-bit encoding. 
auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); - if (CLI.CB) { - auto *BB = CLI.CB->getParent(); + if (CLI.CS) { + auto *BB = CLI.CS.getParent(); PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && count_if(GV->users(), [&BB](const User *U) { return isa<Instruction>(U) && @@ -2384,20 +2126,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee, CallConv, isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, PreferIndirect); - - if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt && - CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail) - isSibCall = true; - + if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. if (isTailCall) ++NumTailCalls; } - if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall()) - report_fatal_error("failed to perform tail call elimination on a call " - "site marked musttail"); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, @@ -2407,40 +2144,13 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); - // SPDiff is the byte offset of the call's argument area from the callee's. - // Stores to callee stack arguments will be placed in FixedStackSlots offset - // by this amount for a tail call. In a sibling call it must be 0 because the - // caller will deallocate the entire stack and the callee still expects its - // arguments to begin at SP+0. Completely unused for non-tail calls. - int SPDiff = 0; - - if (isTailCall && !isSibCall) { - auto FuncInfo = MF.getInfo<ARMFunctionInfo>(); - unsigned NumReusableBytes = FuncInfo->getArgumentStackSize(); - - // Since callee will pop argument stack as a tail call, we must keep the - // popped size 16-byte aligned. - Align StackAlign = DAG.getDataLayout().getStackAlignment(); - NumBytes = alignTo(NumBytes, StackAlign); - - // SPDiff will be negative if this tail call requires more space than we - // would automatically have in our incoming argument space. Positive if we - // can actually shrink the stack. - SPDiff = NumReusableBytes - NumBytes; - - // If this call requires more stack than we have available from - // LowerFormalArguments, tell FrameLowering to reserve space for it. - if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff) - AFI->setArgRegsSaveSize(-SPDiff); - } - - if (isSibCall) { - // For sibling tail calls, memory operands are available in our caller's stack. + if (isTailCall) { + // For tail calls, memory operands are available in our caller's stack. NumBytes = 0; } else { // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 
0 : NumBytes, 0, dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); } SDValue StackPtr = @@ -2449,13 +2159,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPassVector RegsToPass; SmallVector<SDValue, 8> MemOpChains; - // During a tail call, stores to the argument area must happen after all of - // the function's incoming arguments have been loaded because they may alias. - // This is done by folding in a TokenFactor from LowerFormalArguments, but - // there's no point in doing so repeatedly so this tracks whether that's - // happened yet. - bool AfterFormalArgLoads = false; - // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization, arguments are handled later. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); @@ -2484,57 +2187,31 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, break; } - if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) { - Chain = DAG.getStackArgumentTokenFactor(Chain); - AfterFormalArgLoads = true; - } + // f64 and v2f64 might be passed in i32 pairs and must be split into pieces + if (VA.needsCustom()) { + if (VA.getLocVT() == MVT::v2f64) { + SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, dl, MVT::i32)); + SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, dl, MVT::i32)); - // f16 arguments have their size extended to 4 bytes and passed as if they - // had been copied to the LSBs of a 32-bit register. - // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) - if (VA.needsCustom() && - (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) { - Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); - } else { - // f16 arguments could have been extended prior to argument lowering. - // Mask them arguments if this is a CMSE nonsecure call. 
- auto ArgVT = Outs[realArgIdx].ArgVT; - if (isCmseNSCall && (ArgVT == MVT::f16)) { - auto LocBits = VA.getLocVT().getSizeInBits(); - auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits()); - SDValue Mask = - DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); - Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); - Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); - Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); - } - } + PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); - // f64 and v2f64 might be passed in i32 pairs and must be split into pieces - if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { - SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(0, dl, MVT::i32)); - SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(1, dl, MVT::i32)); - - PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], - StackPtr, MemOpChains, isTailCall, SPDiff); - - VA = ArgLocs[++i]; // skip ahead to next loc - if (VA.isRegLoc()) { - PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], - StackPtr, MemOpChains, isTailCall, SPDiff); + VA = ArgLocs[++i]; // skip ahead to next loc + if (VA.isRegLoc()) { + PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + } else { + assert(VA.isMemLoc()); + + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, + dl, DAG, VA, Flags)); + } } else { - assert(VA.isMemLoc()); - SDValue DstAddr; - MachinePointerInfo DstInfo; - std::tie(DstAddr, DstInfo) = - computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); - MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo)); + PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], + StackPtr, MemOpChains, Flags); } - } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { - PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], - StackPtr, MemOpChains, isTailCall, SPDiff); } else if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i32) { @@ -2545,7 +2222,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, isThisReturn = true; } const TargetOptions &Options = DAG.getTarget().Options; - if (Options.EmitCallSiteInfo) + if (Options.EnableDebugEntryValues) CSInfo.emplace_back(VA.getLocReg(), i); RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { @@ -2568,9 +2245,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); - SDValue Load = - DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), - DAG.InferPtrAlign(AddArg)); + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, + MachinePointerInfo(), + DAG.InferPtrAlignment(AddArg)); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(j, Load)); } @@ -2584,31 +2261,26 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Flags.getByValSize() > 4*offset) { auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Dst; - MachinePointerInfo DstInfo; - std::tie(Dst, DstInfo) = - computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue StkPtrOff = 
DAG.getIntPtrConstant(LocMemOffset, dl); + SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); - SDValue AlignNode = - DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32); + SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, + MVT::i32); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops)); } - } else { + } else if (!isTailCall) { assert(VA.isMemLoc()); - SDValue DstAddr; - MachinePointerInfo DstInfo; - std::tie(DstAddr, DstInfo) = - computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); - SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo); - MemOpChains.push_back(Store); + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, + dl, DAG, VA, Flags)); } } @@ -2631,14 +2303,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const TargetMachine &TM = getTargetMachine(); const Module *Mod = MF.getFunction().getParent(); - const GlobalValue *GVal = nullptr; + const GlobalValue *GV = nullptr; if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - GVal = G->getGlobal(); + GV = G->getGlobal(); bool isStub = - !TM.shouldAssumeDSOLocal(*Mod, GVal) && Subtarget->isTargetMachO(); + !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); bool isLocalARMFunc = false; + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); auto PtrVt = getPointerTy(DAG.getDataLayout()); if (Subtarget->genLongCalls()) { @@ -2648,58 +2321,36 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // those, the target's already in a register, so we don't need to do // anything extra. if (isa<GlobalAddressSDNode>(Callee)) { - // When generating execute-only code we use movw movt pair. - // Currently execute-only is only available for architectures that - // support movw movt, so we are safe to assume that. 
- if (Subtarget->genExecuteOnly()) { - assert(Subtarget->useMovt() && - "long-calls with execute-only requires movt and movw!"); - ++NumMovwMovt; - Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt, - DAG.getTargetGlobalAddress(GVal, dl, PtrVt)); - } else { - // Create a constant pool entry for the callee address - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( - GVal, ARMPCLabelIndex, ARMCP::CPValue, 0); - - // Get the address of the callee into a register - SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); - Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr); - Callee = DAG.getLoad( - PtrVt, dl, DAG.getEntryNode(), Addr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); - } + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); + + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); - // When generating execute-only code we use movw movt pair. - // Currently execute-only is only available for architectures that - // support movw movt, so we are safe to assume that. - if (Subtarget->genExecuteOnly()) { - assert(Subtarget->useMovt() && - "long-calls with execute-only requires movt and movw!"); - ++NumMovwMovt; - Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt, - DAG.getTargetGlobalAddress(GVal, dl, PtrVt)); - } else { - // Create a constant pool entry for the callee address - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create( - *DAG.getContext(), Sym, ARMPCLabelIndex, 0); - - // Get the address of the callee into a register - SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); - Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr); - Callee = DAG.getLoad( - PtrVt, dl, DAG.getEntryNode(), Addr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); - } + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, + ARMPCLabelIndex, 0); + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } } else if (isa<GlobalAddressSDNode>(Callee)) { if (!PreferIndirect) { isDirect = true; - bool isDef = GVal->isStrongDefinitionForLinker(); + bool isDef = GV->isStrongDefinitionForLinker(); // ARM call to a local ARM function is predicable. 
isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); @@ -2708,21 +2359,21 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); Callee = DAG.getNode( ARMISD::WrapperPIC, dl, PtrVt, - DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY)); + DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(), - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + /* Alignment = */ 0, MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); unsigned TargetFlags = ARMII::MO_NO_FLAG; - if (GVal->hasDLLImportStorageClass()) + if (GV->hasDLLImportStorageClass()) TargetFlags = ARMII::MO_DLLIMPORT; - else if (!TM.shouldAssumeDSOLocal(*GVal->getParent(), GVal)) + else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) TargetFlags = ARMII::MO_COFFSTUB; - Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0, + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, TargetFlags); if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) Callee = @@ -2730,7 +2381,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), MachinePointerInfo::getGOT(DAG.getMachineFunction())); } else { - Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0); + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); } } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { @@ -2742,7 +2393,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, @@ -2754,33 +2405,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } - if (isCmseNSCall) { - assert(!isARMFunc && !isDirect && - "Cannot handle call to ARM function or direct call"); - if (NumBytes > 0) { - DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(), - "call to non-secure function would " - "require passing arguments on stack", - dl.getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } - if (isStructRet) { - DiagnosticInfoUnsupported Diag( - DAG.getMachineFunction().getFunction(), - "call to non-secure function would return value through pointer", - dl.getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } - } - // FIXME: handle tail calls differently. unsigned CallOpc; if (Subtarget->isThumb()) { - if (GuardWithBTI) - CallOpc = ARMISD::t2CALL_BTI; - else if (isCmseNSCall) - CallOpc = ARMISD::tSECALL; - else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) + if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else CallOpc = ARMISD::CALL; @@ -2796,23 +2424,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallOpc = isLocalARMFunc ? 
ARMISD::CALL_PRED : ARMISD::CALL; } - // We don't usually want to end the call-sequence here because we would tidy - // the frame up *after* the call, however in the ABI-changing tail-call case - // we've carefully laid out the parameters so that when sp is reset they'll be - // in the correct location. - if (isTailCall && !isSibCall) { - Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, dl); - InFlag = Chain.getValue(1); - } - std::vector<SDValue> Ops; Ops.push_back(Chain); Ops.push_back(Callee); - if (isTailCall) { - Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32)); - } - // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) @@ -2820,23 +2435,25 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. - const uint32_t *Mask; - const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); - if (isThisReturn) { - // For 'this' returns, use the R0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(MF, CallConv); - if (!Mask) { - // Set isThisReturn to false if the calling convention is not one that - // allows 'returned' to be modeled in this way, so LowerCallResult does - // not try to pass 'this' straight through - isThisReturn = false; + if (!isTailCall) { + const uint32_t *Mask; + const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); + if (isThisReturn) { + // For 'this' returns, use the R0-preserving mask if applicable + Mask = ARI->getThisReturnPreservedMask(MF, CallConv); + if (!Mask) { + // Set isThisReturn to false if the calling convention is not one that + // allows 'returned' to be modeled in this way, so LowerCallResult does + // not try to pass 'this' straight through + isThisReturn = false; + Mask = ARI->getCallPreservedMask(MF, CallConv); + } + } else Mask = ARI->getCallPreservedMask(MF, CallConv); - } - } else - Mask = ARI->getCallPreservedMask(MF, CallConv); - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + } if (InFlag.getNode()) Ops.push_back(InFlag); @@ -2851,18 +2468,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); - DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); - // If we're guaranteeing tail-calls will be honoured, the callee must - // pop its own argument stack on return. But this call is *not* a tail call so - // we need to undo that after it returns to restore the status-quo. - bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; - uint64_t CalleePopBytes = - canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL; - - Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, dl); + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), + DAG.getIntPtrConstant(0, dl, true), InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); @@ -2878,15 +2488,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, /// and then confiscate the rest of the parameter registers to insure /// this. 
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, - Align Alignment) const { + unsigned Align) const { // Byval (as with any stack) slots are always at least 4 byte aligned. - Alignment = std::max(Alignment, Align(4)); + Align = std::max(Align, 4U); unsigned Reg = State->AllocateReg(GPRArgRegs); if (!Reg) return; - unsigned AlignInRegs = Alignment.value() / 4; + unsigned AlignInRegs = Align / 4; unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; for (unsigned i = 0; i < Waste; ++i) Reg = State->AllocateReg(GPRArgRegs); @@ -2937,8 +2547,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, unsigned Bytes = Arg.getValueSizeInBits() / 8; int FI = std::numeric_limits<int>::max(); if (Arg.getOpcode() == ISD::CopyFromReg) { - Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); - if (!VR.isVirtual()) + unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) @@ -2990,17 +2600,9 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( // Indirect tail calls cannot be optimized for Thumb1 if the args // to the call take up r0-r3. The reason is that there are no legal registers // left to hold the pointer to the function to be called. - // Similarly, if the function uses return address sign and authentication, - // r12 is needed to hold the PAC and is not available to hold the callee - // address. - if (Outs.size() >= 4 && - (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) { - if (Subtarget->isThumb1Only()) - return false; - // Conservatively assume the function spills LR. - if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)) - return false; - } + if (Subtarget->isThumb1Only() && Outs.size() >= 4 && + (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) + return false; // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. @@ -3011,9 +2613,6 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( if (CallerF.hasFnAttribute("interrupt")) return false; - if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) - return CalleeCC == CallerCC; - // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) @@ -3036,11 +2635,9 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( // Check that the call results are passed in the same way. LLVMContext &C = *DAG.getContext(); - if (!CCState::resultsCompatible( - getEffectiveCallingConv(CalleeCC, isVarArg), - getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins, - CCAssignFnForReturn(CalleeCC, isVarArg), - CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) + if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, + CCAssignFnForReturn(CalleeCC, isVarArg), + CCAssignFnForReturn(CallerCC, isVarArg))) return false; // The callee has to preserve all registers the caller needs to preserve. const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); @@ -3081,7 +2678,7 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; - if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) { + if (VA.needsCustom()) { // f64 and vector types are split into multiple registers or // register/stack-slot combinations. 
The types will not match // the registers; give up on memory f64 refs until we figure @@ -3180,17 +2777,6 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); AFI->setReturnRegsCount(RVLocs.size()); - // Report error if cmse entry function returns structure through first ptr arg. - if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) { - // Note: using an empty SDLoc(), as the first line of the function is a - // better place to report than the last line. - DiagnosticInfoUnsupported Diag( - DAG.getMachineFunction().getFunction(), - "secure entry function would return value through pointer", - SDLoc().getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } - // Copy the result values into the output registers. for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); @@ -3233,24 +2819,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, break; } - // Mask f16 arguments if this is a CMSE nonsecure entry. - auto RetVT = Outs[realRVLocIdx].ArgVT; - if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) { - if (VA.needsCustom() && VA.getValVT() == MVT::f16) { - Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); - } else { - auto LocBits = VA.getLocVT().getSizeInBits(); - auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits()); - SDValue Mask = - DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); - Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); - Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); - Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); - } - } - - if (VA.needsCustom() && - (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) { + if (VA.needsCustom()) { if (VA.getLocVT() == MVT::v2f64) { // Extract the first half and return it in two registers. SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, @@ -3258,15 +2827,15 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Half); - Chain = - DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(isLittleEndian ? 0 : 1), + Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc - Chain = - DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(isLittleEndian ? 1 : 0), + Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc @@ -3280,20 +2849,22 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - fmrrd.getValue(isLittleEndian ? 0 : 1), Flag); + fmrrd.getValue(isLittleEndian ? 0 : 1), + Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); + fmrrd.getValue(isLittleEndian ? 
1 : 0), + Flag); } else Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); // Guarantee that all emitted copies are // stuck together, avoiding something bad. Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister( - VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT())); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), + ReturnF16 ? MVT::f16 : VA.getLocVT())); } const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = @@ -3327,9 +2898,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return LowerInterruptReturn(RetOps, dl, DAG); } - ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG : - ARMISD::RET_FLAG; - return DAG.getNode(RetNode, dl, MVT::Other, RetOps); + return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); } bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -3350,24 +2919,26 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { SDNode *VMov = Copy; // f64 returned in a pair of GPRs. SmallPtrSet<SDNode*, 2> Copies; - for (SDNode *U : VMov->uses()) { - if (U->getOpcode() != ISD::CopyToReg) + for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != ISD::CopyToReg) return false; - Copies.insert(U); + Copies.insert(*UI); } if (Copies.size() > 2) return false; - for (SDNode *U : VMov->uses()) { - SDValue UseChain = U->getOperand(0); + for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); + UI != UE; ++UI) { + SDValue UseChain = UI->getOperand(0); if (Copies.count(UseChain.getNode())) // Second CopyToReg - Copy = U; + Copy = *UI; else { // We are at the top of this chain. // If the copy has a glue operand, we conservatively assume it // isn't safe to perform a tail call. - if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue) + if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) return false; // First CopyToReg TCChain = UseChain; @@ -3390,9 +2961,10 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { } bool HasRet = false; - for (const SDNode *U : Copy->uses()) { - if (U->getOpcode() != ARMISD::RET_FLAG && - U->getOpcode() != ARMISD::INTRET_FLAG) + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != ARMISD::RET_FLAG && + UI->getOpcode() != ARMISD::INTRET_FLAG) return false; HasRet = true; } @@ -3467,16 +3039,12 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, return LowerGlobalAddress(GA, DAG); } - // The 16-bit ADR instruction can only encode offsets that are multiples of 4, - // so we need to align to at least 4 bytes when we don't have 32-bit ADR. 
- Align CPAlign = CP->getAlign(); - if (Subtarget->isThumb1Only()) - CPAlign = std::max(CPAlign, Align(4)); if (CP->isMachineConstantPoolEntry()) - Res = - DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign); + Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, + CP->getAlignment()); else - Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign); + Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, + CP->getAlignment()); return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); } @@ -3495,14 +3063,14 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SDValue CPAddr; bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); if (!IsPositionIndependent) { - CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4)); + CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); } else { unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, ARMCP::CPBlockAddress, PCAdj); - CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); SDValue Result = DAG.getLoad( @@ -3554,7 +3122,8 @@ ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( MVT::i32, DL, Chain, DescAddr, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4), + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + /* Alignment = */ 4, MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); Chain = FuncTLVGet.getValue(1); @@ -3630,9 +3199,8 @@ ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, const auto *GA = cast<GlobalAddressSDNode>(Op); auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); SDValue Offset = DAG.getLoad( - PtrVT, DL, Chain, - DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, - DAG.getTargetConstantPool(CPV, PtrVT, Align(4))), + PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, + DAG.getTargetConstantPool(CPV, PtrVT, 4)), MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); @@ -3651,7 +3219,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); - SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); Argument = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), Argument, @@ -3702,7 +3270,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true); - Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, @@ -3720,7 +3288,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, assert(model == TLSModel::LocalExec); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); - Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = 
DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, @@ -3762,11 +3330,14 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { /// Return true if all users of V are within function F, looking through /// ConstantExprs. static bool allUsersAreInFunction(const Value *V, const Function *F) { - SmallVector<const User*,4> Worklist(V->users()); + SmallVector<const User*,4> Worklist; + for (auto *U : V->users()) + Worklist.push_back(U); while (!Worklist.empty()) { auto *U = Worklist.pop_back_val(); if (isa<ConstantExpr>(U)) { - append_range(Worklist, U->users()); + for (auto *UU : U->users()) + Worklist.push_back(UU); continue; } @@ -3809,7 +3380,7 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, // from .data to .text. This is not allowed in position-independent code. auto *Init = GVar->getInitializer(); if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) && - Init->needsDynamicRelocation()) + Init->needsRelocation()) return SDValue(); // The constant islands pass can only really deal with alignment requests @@ -3820,11 +3391,11 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, // that are strings for simplicity. auto *CDAInit = dyn_cast<ConstantDataArray>(Init); unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); - Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar); + unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar); unsigned RequiredPadding = 4 - (Size % 4); bool PaddingPossible = RequiredPadding == 4 || (CDAInit && CDAInit->isString()); - if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize || + if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || Size == 0) return SDValue(); @@ -3863,7 +3434,8 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, } auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); - SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4)); + SDValue CPAddr = + DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { AFI->markGlobalAsPromotedToConstantPool(GVar); AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + @@ -3875,7 +3447,7 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - if (!(GV = GA->getAliaseeObject())) + if (!(GV = GA->getBaseObject())) return false; if (const auto *V = dyn_cast<GlobalVariable>(GV)) return V->isConstant(); @@ -3933,7 +3505,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, } else { // use literal pool for address constant ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); RelAddr = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, @@ -3953,7 +3525,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, DAG.getTargetGlobalAddress(GV, dl, PtrVT)); } else { - SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4)); + SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); return DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), 
CPAddr, @@ -4061,10 +3633,10 @@ SDValue ARMTargetLowering::LowerINTRINSIC_VOID( ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); assert(Mask && "Missing call preserved mask for calling convention"); // Mark LR an implicit live-in. - Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); SDValue ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); - constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue}; + std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue}; SDValue Callee = DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); SDValue RegisterMask = DAG.getRegisterMask(Mask); @@ -4148,7 +3720,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, ARMCP::CPLSDA, PCAdj); - CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, @@ -4210,15 +3782,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::arm_mve_pred_v2i: return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), Op.getOperand(1)); - case Intrinsic::arm_mve_vreinterpretq: - return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(), - Op.getOperand(1)); - case Intrinsic::arm_mve_lsll: - return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::arm_mve_asrl: - return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } @@ -4315,7 +3878,7 @@ SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, RC = &ARM::GPRRegClass; // Transform the arguments stored in physical registers into virtual ones. - Register Reg = MF.addLiveIn(VA.getLocReg(), RC); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); SDValue ArgValue2; @@ -4385,7 +3948,7 @@ int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, AFI->isThumb1OnlyFunction() ? 
&ARM::tGPRRegClass : &ARM::GPRRegClass; for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { - Register VReg = MF.addLiveIn(Reg, RC); + unsigned VReg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(OrigArg, 4 * i)); @@ -4419,42 +3982,6 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, AFI->setVarArgsFrameIndex(FrameIndex); } -bool ARMTargetLowering::splitValueIntoRegisterParts( - SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, - unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const { - bool IsABIRegCopy = CC.has_value(); - EVT ValueVT = Val.getValueType(); - if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && - PartVT == MVT::f32) { - unsigned ValueBits = ValueVT.getSizeInBits(); - unsigned PartBits = PartVT.getSizeInBits(); - Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); - Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); - Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); - Parts[0] = Val; - return true; - } - return false; -} - -SDValue ARMTargetLowering::joinRegisterPartsIntoValue( - SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, - MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const { - bool IsABIRegCopy = CC.has_value(); - if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && - PartVT == MVT::f32) { - unsigned ValueBits = ValueVT.getSizeInBits(); - unsigned PartBits = PartVT.getSizeInBits(); - SDValue Val = Parts[0]; - - Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); - Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); - Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); - return Val; - } - return SDValue(); -} - SDValue ARMTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, @@ -4508,7 +4035,7 @@ SDValue ARMTargetLowering::LowerFormalArguments( int lastInsIndex = -1; if (isVarArg && MFI.hasVAStart()) { unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); - if (RegIdx != std::size(GPRArgRegs)) + if (RegIdx != array_lengthof(GPRArgRegs)) ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); } @@ -4527,41 +4054,44 @@ SDValue ARMTargetLowering::LowerFormalArguments( if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { + if (VA.needsCustom()) { // f64 and vector types are split up into multiple registers or // combinations of registers and stack slots. 
- SDValue ArgValue1 = - GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); - VA = ArgLocs[++i]; // skip ahead to next loc - SDValue ArgValue2; - if (VA.isMemLoc()) { - int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgValue2 = DAG.getLoad( - MVT::f64, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); - } else { - ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); - } - ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); - ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, - ArgValue1, DAG.getIntPtrConstant(0, dl)); - ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, - ArgValue2, DAG.getIntPtrConstant(1, dl)); - } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { - ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + if (VA.getLocVT() == MVT::v2f64) { + SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + VA = ArgLocs[++i]; // skip ahead to next loc + SDValue ArgValue2; + if (VA.isMemLoc()) { + int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FI)); + } else { + ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + } + ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue1, + DAG.getIntPtrConstant(0, dl)); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue2, + DAG.getIntPtrConstant(1, dl)); + } else + ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } else { const TargetRegisterClass *RC; - if (RegVT == MVT::f16 || RegVT == MVT::bf16) + + if (RegVT == MVT::f16) RC = &ARM::HPRRegClass; else if (RegVT == MVT::f32) RC = &ARM::SPRRegClass; - else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 || - RegVT == MVT::v4bf16) + else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) RC = &ARM::DPRRegClass; - else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 || - RegVT == MVT::v8bf16) + else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) RC = &ARM::QPRRegClass; else if (RegVT == MVT::i32) RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass @@ -4570,7 +4100,7 @@ SDValue ARMTargetLowering::LowerFormalArguments( llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. - Register Reg = MF.addLiveIn(VA.getLocReg(), RC); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); // If this value is passed in r0 and has the returned attribute (e.g. @@ -4601,16 +4131,9 @@ SDValue ARMTargetLowering::LowerFormalArguments( break; } - // f16 arguments have their size extended to 4 bytes and passed as if they - // had been copied to the LSBs of a 32-bit register. - // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) - if (VA.needsCustom() && - (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) - ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue); - InVals.push_back(ArgValue); } else { // VA.isRegLoc() - // Only arguments passed on the stack should make it here. 
+ // sanity check assert(VA.isMemLoc()); assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); @@ -4653,35 +4176,12 @@ SDValue ARMTargetLowering::LowerFormalArguments( } // varargs - if (isVarArg && MFI.hasVAStart()) { - VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(), + if (isVarArg && MFI.hasVAStart()) + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, + CCInfo.getNextStackOffset(), TotalArgRegsSaveSize); - if (AFI->isCmseNSEntryFunction()) { - DiagnosticInfoUnsupported Diag( - DAG.getMachineFunction().getFunction(), - "secure entry function must not be variadic", dl.getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } - } - unsigned StackArgSize = CCInfo.getNextStackOffset(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; - if (canGuaranteeTCO(CallConv, TailCallOpt)) { - // The only way to guarantee a tail call is if the callee restores its - // argument area, but it must also keep the stack aligned when doing so. - const DataLayout &DL = DAG.getDataLayout(); - StackArgSize = alignTo(StackArgSize, DL.getStackAlignment()); - - AFI->setArgumentStackToRestore(StackArgSize); - } - AFI->setArgumentStackSize(StackArgSize); - - if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) { - DiagnosticInfoUnsupported Diag( - DAG.getMachineFunction().getFunction(), - "secure entry function requires arguments on stack", dl.getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } + AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); return Chain; } @@ -5046,49 +4546,24 @@ SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } -static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { +static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { EVT VT = Op.getValueType(); - if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) + if (!Subtarget->hasDSP()) return SDValue(); if (!VT.isSimple()) return SDValue(); unsigned NewOpcode; + bool IsAdd = Op->getOpcode() == ISD::SADDSAT; switch (VT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::i8: - switch (Op->getOpcode()) { - case ISD::UADDSAT: - NewOpcode = ARMISD::UQADD8b; - break; - case ISD::SADDSAT: - NewOpcode = ARMISD::QADD8b; - break; - case ISD::USUBSAT: - NewOpcode = ARMISD::UQSUB8b; - break; - case ISD::SSUBSAT: - NewOpcode = ARMISD::QSUB8b; - break; - } + NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b; break; case MVT::i16: - switch (Op->getOpcode()) { - case ISD::UADDSAT: - NewOpcode = ARMISD::UQADD16b; - break; - case ISD::SADDSAT: - NewOpcode = ARMISD::QADD16b; - break; - case ISD::USUBSAT: - NewOpcode = ARMISD::UQSUB16b; - break; - case ISD::SSUBSAT: - NewOpcode = ARMISD::QSUB16b; - break; - } + NewOpcode = IsAdd ? ARMISD::QADD16b : ARMISD::QSUB16b; break; } @@ -5268,6 +4743,16 @@ static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); } +// Similar to isLowerSaturate(), but checks for upper-saturating conditions. +static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, + const SDValue TrueVal, const SDValue FalseVal, + const ISD::CondCode CC, const SDValue K) { + return (isGTorGE(CC) && + ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || + (isLTorLE(CC) && + ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); +} + // Check if two chained conditionals could be converted into SSAT or USAT. 
// // SSAT can replace a set of two conditional selectors that bound a number to an @@ -5279,68 +4764,101 @@ static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, // x < k ? (x < -k ? -k : x) : k // etc. // -// LLVM canonicalizes these to either a min(max()) or a max(min()) -// pattern. This function tries to match one of these and will return a SSAT -// node if successful. +// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is +// a power of 2. // -// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 -// is a power of 2. -static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - SDValue V1 = Op.getOperand(0); - SDValue K1 = Op.getOperand(1); +// It returns true if the conversion can be done, false otherwise. +// Additionally, the variable is returned in parameter V, the constant in K and +// usat is set to true if the conditional represents an unsigned saturation +static bool isSaturatingConditional(const SDValue &Op, SDValue &V, + uint64_t &K, bool &usat) { + SDValue LHS1 = Op.getOperand(0); + SDValue RHS1 = Op.getOperand(1); SDValue TrueVal1 = Op.getOperand(2); SDValue FalseVal1 = Op.getOperand(3); ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1; if (Op2.getOpcode() != ISD::SELECT_CC) - return SDValue(); + return false; - SDValue V2 = Op2.getOperand(0); - SDValue K2 = Op2.getOperand(1); + SDValue LHS2 = Op2.getOperand(0); + SDValue RHS2 = Op2.getOperand(1); SDValue TrueVal2 = Op2.getOperand(2); SDValue FalseVal2 = Op2.getOperand(3); ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); - SDValue V1Tmp = V1; - SDValue V2Tmp = V2; + // Find out which are the constants and which are the variables + // in each conditional + SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1) + ? &RHS1 + : nullptr; + SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2) + ? &RHS2 + : nullptr; + SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2; + SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; + SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2; + SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2; + + // We must detect cases where the original operations worked with 16- or + // 8-bit values. In such case, V2Tmp != V2 because the comparison operations + // must work with sign-extended values but the select operations return + // the original non-extended value. + SDValue V2TmpReg = V2Tmp; + if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG) + V2TmpReg = V2Tmp->getOperand(0); + + // Check that the registers and the constants have the correct values + // in both conditionals + if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp || + V2TmpReg != V2) + return false; - // Check that the registers and the constants match a max(min()) or min(max()) - // pattern - if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 || - K2 != FalseVal2 || - !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2)))) - return SDValue(); + // Figure out which conditional is saturating the lower/upper bound. + const SDValue *LowerCheckOp = + isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) + ? &Op + : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) + ? &Op2 + : nullptr; + const SDValue *UpperCheckOp = + isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) + ? 
&Op + : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) + ? &Op2 + : nullptr; + + if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) + return false; // Check that the constant in the lower-bound check is // the opposite of the constant in the upper-bound check // in 1's complement. - if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2)) - return SDValue(); - - int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue(); - int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue(); + int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue(); + int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue(); int64_t PosVal = std::max(Val1, Val2); int64_t NegVal = std::min(Val1, Val2); - if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) || - !isPowerOf2_64(PosVal + 1)) - return SDValue(); + if (((Val1 > Val2 && UpperCheckOp == &Op) || + (Val1 < Val2 && UpperCheckOp == &Op2)) && + isPowerOf2_64(PosVal + 1)) { - // Handle the difference between USAT (unsigned) and SSAT (signed) - // saturation - // At this point, PosVal is guaranteed to be positive - uint64_t K = PosVal; - SDLoc dl(Op); - if (Val1 == ~Val2) - return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp, - DAG.getConstant(countTrailingOnes(K), dl, VT)); - if (NegVal == 0) - return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp, - DAG.getConstant(countTrailingOnes(K), dl, VT)); + // Handle the difference between USAT (unsigned) and SSAT (signed) saturation + if (Val1 == ~Val2) + usat = false; + else if (NegVal == 0) + usat = true; + else + return false; - return SDValue(); + V = V2; + K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive + + return true; + } + + return false; } // Check if a condition of the type x < k ? k : x can be converted into a @@ -5400,9 +4918,18 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // Try to convert two saturating conditional selects into a single SSAT - if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) - if (SDValue SatValue = LowerSaturatingConditional(Op, DAG)) - return SatValue; + SDValue SatValue; + uint64_t SatConstant; + bool SatUSat; + if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && + isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) { + if (SatUSat) + return DAG.getNode(ARMISD::USAT, dl, VT, SatValue, + DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); + else + return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, + DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); + } // Try to convert expressions of the form x < k ? k : x (and similar forms) // into more efficient bit operations, which is possible when k is 0 or -1 @@ -5411,7 +4938,6 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // instructions. // Only allow this transformation on full-width (32-bit) operations SDValue LowerSatConstant; - SDValue SatValue; if (VT == MVT::i32 && isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, @@ -5469,6 +4995,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { std::swap(TVal, FVal); CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } + if (TVal == 0) + TrueVal = DAG.getRegister(ARM::ZR, MVT::i32); // Drops F's value because we can get it by inverting/negating TVal. 
FalseVal = TrueVal; @@ -5590,7 +5118,7 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlign(), + Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); llvm_unreachable("Unknown VFP cmp argument!"); @@ -5610,14 +5138,14 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue Ptr = Ld->getBasePtr(); RetVal1 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), - Ld->getAlign(), Ld->getMemOperand()->getFlags()); + Ld->getAlignment(), Ld->getMemOperand()->getFlags()); EVT PtrType = Ptr.getValueType(); + unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); SDValue NewPtr = DAG.getNode(ISD::ADD, dl, PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, - Ld->getPointerInfo().getWithOffset(4), - commonAlignment(Ld->getAlign(), 4), + Ld->getPointerInfo().getWithOffset(4), NewAlign, Ld->getMemOperand()->getFlags()); return; } @@ -5844,7 +5372,8 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { return DAG.UnrollVectorOp(Op.getNode()); } - const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16(); + const bool HasFullFP16 = + static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); EVT NewTy; const EVT OpTy = Op.getOperand(0).getValueType(); @@ -5903,43 +5432,6 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { return Op; } -static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - EVT VT = Op.getValueType(); - EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); - EVT FromVT = Op.getOperand(0).getValueType(); - - if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32) - return Op; - if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 && - Subtarget->hasFP64()) - return Op; - if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 && - Subtarget->hasFullFP16()) - return Op; - if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 && - Subtarget->hasMVEFloatOps()) - return Op; - if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 && - Subtarget->hasMVEFloatOps()) - return Op; - - if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16) - return SDValue(); - - SDLoc DL(Op); - bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT; - unsigned BW = ToVT.getScalarSizeInBits() - IsSigned; - SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), - DAG.getValueType(VT.getScalarType())); - SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT, - DAG.getConstant((1 << BW) - 1, DL, VT)); - if (IsSigned) - Max = DAG.getNode(ISD::SMAX, DL, VT, Max, - DAG.getConstant(-(1 << BW), DL, VT)); - return Max; -} - static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); SDLoc dl(Op); @@ -5954,7 +5446,8 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { Op.getOperand(0).getValueType() == MVT::v8i16) && "Invalid type for custom lowering!"); - const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16(); + const bool HasFullFP16 = + static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); EVT DestVecType; if (VT == MVT::v4f32) @@ -6106,7 +5599,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ } // Return LR, which contains the return address. 
Mark it an implicit live-in. - Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } @@ -6216,27 +5709,85 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 /// operand type is illegal (e.g., v2f32 for a target that doesn't support /// vectors), since the legalizer won't know what to do with that. -SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) const { +static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(N); SDValue Op = N->getOperand(0); - // This function is only supposed to be called for i16 and i64 types, either - // as the source or destination of the bit convert. + // This function is only supposed to be called for i64 types, either as the + // source or destination of the bit convert. EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); + const bool HasFullFP16 = Subtarget->hasFullFP16(); + + if (SrcVT == MVT::f32 && DstVT == MVT::i32) { + // FullFP16: half values are passed in S-registers, and we don't + // need any of the bitcast and moves: + // + // t2: f32,ch = CopyFromReg t0, Register:f32 %0 + // t5: i32 = bitcast t2 + // t18: f16 = ARMISD::VMOVhr t5 + if (Op.getOpcode() != ISD::CopyFromReg || + Op.getValueType() != MVT::f32) + return SDValue(); + + auto Move = N->use_begin(); + if (Move->getOpcode() != ARMISD::VMOVhr) + return SDValue(); + + SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; + SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); + DAG.ReplaceAllUsesWith(*Move, &Copy); + return Copy; + } + + if (SrcVT == MVT::i16 && DstVT == MVT::f16) { + if (!HasFullFP16) + return SDValue(); + // SoftFP: read half-precision arguments: + // + // t2: i32,ch = ... 
+ // t7: i16 = truncate t2 <~~~~ Op + // t8: f16 = bitcast t7 <~~~~ N + // + if (Op.getOperand(0).getValueType() == MVT::i32) + return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), + MVT::f16, Op.getOperand(0)); - if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) && - (DstVT == MVT::f16 || DstVT == MVT::bf16)) - return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(), - DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op)); + return SDValue(); + } - if ((DstVT == MVT::i16 || DstVT == MVT::i32) && - (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) - return DAG.getNode( - ISD::TRUNCATE, SDLoc(N), DstVT, - MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op)); + // Half-precision return values + if (SrcVT == MVT::f16 && DstVT == MVT::i16) { + if (!HasFullFP16) + return SDValue(); + // + // t11: f16 = fadd t8, t10 + // t12: i16 = bitcast t11 <~~~ SDNode N + // t13: i32 = zero_extend t12 + // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 + // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 + // + // transform this into: + // + // t20: i32 = ARMISD::VMOVrh t11 + // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 + // + auto ZeroExtend = N->use_begin(); + if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || + ZeroExtend->getValueType(0) != MVT::i32) + return SDValue(); + + auto Copy = ZeroExtend->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg && + Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { + SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); + DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); + return Cvt; + } + return SDValue(); + } if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) return SDValue(); @@ -6372,69 +5923,23 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } -SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op, - SelectionDAG &DAG) const { +SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { // The rounding mode is in bits 23:22 of the FPSCR. // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); - SDValue Chain = Op.getOperand(0); - SDValue Ops[] = {Chain, - DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)}; + SDValue Ops[] = { DAG.getEntryNode(), + DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; - SDValue FPSCR = - DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops); - Chain = FPSCR.getValue(1); + SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); - SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, - DAG.getConstant(3, dl, MVT::i32)); - return DAG.getMergeValues({And, Chain}, dl); -} - -SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - SDValue Chain = Op->getOperand(0); - SDValue RMValue = Op->getOperand(1); - - // The rounding mode is in bits 23:22 of the FPSCR. - // The llvm.set.rounding argument value to ARM rounding mode value mapping - // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is - // ((arg - 1) & 3) << 22). 
- // - // It is expected that the argument of llvm.set.rounding is within the - // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is - // responsibility of the code generated llvm.set.rounding to ensure this - // condition. - - // Calculate new value of FPSCR[23:22]. - RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue, - DAG.getConstant(1, DL, MVT::i32)); - RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue, - DAG.getConstant(0x3, DL, MVT::i32)); - RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue, - DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32)); - - // Get current value of FPSCR. - SDValue Ops[] = {Chain, - DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)}; - SDValue FPSCR = - DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops); - Chain = FPSCR.getValue(1); - FPSCR = FPSCR.getValue(0); - - // Put new rounding mode into FPSCR[23:22]. - const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos); - FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR, - DAG.getConstant(RMMask, DL, MVT::i32)); - FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue); - SDValue Ops2[] = { - Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR}; - return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); + return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + DAG.getConstant(3, dl, MVT::i32)); } static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, @@ -6766,23 +6271,23 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, if (ST->hasMVEFloatOps()) { Opc = ARMCC::NE; break; } else { - Invert = true; [[fallthrough]]; + Invert = true; LLVM_FALLTHROUGH; } case ISD::SETOEQ: case ISD::SETEQ: Opc = ARMCC::EQ; break; case ISD::SETOLT: - case ISD::SETLT: Swap = true; [[fallthrough]]; + case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: Opc = ARMCC::GT; break; case ISD::SETOLE: - case ISD::SETLE: Swap = true; [[fallthrough]]; + case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGE: case ISD::SETGE: Opc = ARMCC::GE; break; - case ISD::SETUGE: Swap = true; [[fallthrough]]; + case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; - case ISD::SETUGT: Swap = true; [[fallthrough]]; + case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; - case ISD::SETUEQ: Invert = true; [[fallthrough]]; + case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; case ISD::SETONE: { // Expand this to (OLT | OGT). SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, @@ -6794,7 +6299,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, Result = DAG.getNOT(dl, Result, VT); return Result; } - case ISD::SETUO: Invert = true; [[fallthrough]]; + case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; case ISD::SETO: { // Expand this to (OLT | OGE). 
SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, @@ -6815,16 +6320,16 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, if (ST->hasMVEIntegerOps()) { Opc = ARMCC::NE; break; } else { - Invert = true; [[fallthrough]]; + Invert = true; LLVM_FALLTHROUGH; } case ISD::SETEQ: Opc = ARMCC::EQ; break; - case ISD::SETLT: Swap = true; [[fallthrough]]; + case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGT: Opc = ARMCC::GT; break; - case ISD::SETLE: Swap = true; [[fallthrough]]; + case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGE: Opc = ARMCC::GE; break; - case ISD::SETULT: Swap = true; [[fallthrough]]; + case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: Opc = ARMCC::HI; break; - case ISD::SETULE: Swap = true; [[fallthrough]]; + case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGE: Opc = ARMCC::HS; break; } @@ -6856,25 +6361,25 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, // If one of the operands is a constant vector zero, attempt to fold the // comparison to a specialized compare-against-zero form. - if (ISD::isBuildVectorAllZeros(Op0.getNode()) && - (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ || - Opc == ARMCC::NE)) { + SDValue SingleOp; + if (ISD::isBuildVectorAllZeros(Op1.getNode())) + SingleOp = Op0; + else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { if (Opc == ARMCC::GE) Opc = ARMCC::LE; else if (Opc == ARMCC::GT) Opc = ARMCC::LT; - std::swap(Op0, Op1); + SingleOp = Op1; } SDValue Result; - if (ISD::isBuildVectorAllZeros(Op1.getNode()) && - (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE || - Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ)) - Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0, + if (SingleOp.getNode()) { + Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, DAG.getConstant(Opc, dl, MVT::i32)); - else + } else { Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, DAG.getConstant(Opc, dl, MVT::i32)); + } Result = DAG.getSExtOrTrunc(Result, dl, VT); @@ -6919,10 +6424,9 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { /// immediate" operand (e.g., VMOV). If so, return the encoded value. static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, - const SDLoc &dl, EVT &VT, EVT VectorVT, + const SDLoc &dl, EVT &VT, bool is128Bits, VMOVModImmType type) { unsigned OpCmode, Imm; - bool is128Bits = VectorVT.is128BitVector(); // SplatBitSize is set to the smallest size that splats the vector, so a // zero vector will always have SplatBitSize == 8. However, NEON modified @@ -7026,10 +6530,12 @@ static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, return SDValue(); // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. uint64_t BitMask = 0xff; + uint64_t Val = 0; unsigned ImmMask = 1; Imm = 0; for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { if (((SplatBits | SplatUndef) & BitMask) == BitMask) { + Val |= BitMask; Imm |= ImmMask; } else if ((SplatBits & BitMask) != 0) { return SDValue(); @@ -7038,18 +6544,9 @@ static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, ImmMask <<= 1; } - if (DAG.getDataLayout().isBigEndian()) { - // Reverse the order of elements within the vector. 
- unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8; - unsigned Mask = (1 << BytesPerElem) - 1; - unsigned NumElems = 8 / BytesPerElem; - unsigned NewImm = 0; - for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) { - unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask); - NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem; - } - Imm = NewImm; - } + if (DAG.getDataLayout().isBigEndian()) + // swap higher and lower 32 bit word + Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); // Op=1, Cmode=1110. OpCmode = 0x1e; @@ -7088,6 +6585,8 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, case MVT::f64: { SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); + if (!ST->isLittle()) + std::swap(Lo, Hi); return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); } case MVT::f32: @@ -7140,7 +6639,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), - VMovVT, VT, VMOVModImm); + VMovVT, false, VMOVModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, @@ -7157,7 +6656,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, // Finally, try a VMVN.i32 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, - VT, VMVNModImm); + false, VMVNModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); @@ -7241,6 +6740,35 @@ static bool isVEXTMask(ArrayRef<int> M, EVT VT, return true; } +/// isVREVMask - Check if a vector shuffle corresponds to a VREV +/// instruction with the specified blocksize. (The order of the elements +/// within each block of the vector is reversed.) +static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { + assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && + "Only possible block sizes for VREV are: 16, 32, 64"); + + unsigned EltSz = VT.getScalarSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BlockElts = M[0] + 1; + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSz; + + if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + if (M[i] < 0) continue; // ignore UNDEF indices + if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) + return false; + } + + return true; +} + static bool isVTBLMask(ArrayRef<int> M, EVT VT) { // We can handle <8 x i8> vector shuffles. If the index in the mask is out of // range, then 0 is placed into the resulting vector. So pretty much any mask @@ -7513,33 +7041,11 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) { return true; } -static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) { +static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) { unsigned NumElts = VT.getVectorNumElements(); // Make sure the mask has the right size. if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) - return false; - - // Half-width truncation patterns (e.g. 
v4i32 -> v8i16): - // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6> - // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14> - // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7> - // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15> - int Ofs = Top ? 1 : 0; - int Upper = SingleSource ? 0 : NumElts; - for (int i = 0, e = NumElts / 2; i != e; ++i) { - if (M[i] >= 0 && M[i] != (i * 2) + Ofs) - return false; - if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper) return false; - } - return true; -} - -static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) { - unsigned NumElts = VT.getVectorNumElements(); - // Make sure the mask has the right size. - if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) - return false; // If Top // Look for <0, N, 2, N+2, 4, N+4, ..>. @@ -7548,137 +7054,16 @@ static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) { // Look for <0, N+1, 2, N+3, 4, N+5, ..> // This inserts Input1 into Input2 unsigned Offset = Top ? 0 : 1; - unsigned N = SingleSource ? 0 : NumElts; - for (unsigned i = 0; i < NumElts; i += 2) { + for (unsigned i = 0; i < NumElts; i+=2) { if (M[i] >= 0 && M[i] != (int)i) return false; - if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset)) + if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset)) return false; } return true; } -static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) { - unsigned NumElts = ToVT.getVectorNumElements(); - if (NumElts != M.size()) - return false; - - // Test if the Trunc can be convertable to a VMOVN with this shuffle. We are - // looking for patterns of: - // !rev: 0 N/2 1 N/2+1 2 N/2+2 ... - // rev: N/2 0 N/2+1 1 N/2+2 2 ... - - unsigned Off0 = rev ? NumElts / 2 : 0; - unsigned Off1 = rev ? 0 : NumElts / 2; - for (unsigned i = 0; i < NumElts; i += 2) { - if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2)) - return false; - if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2)) - return false; - } - - return true; -} - -// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted -// from a pair of inputs. For example: -// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), -// FP_ROUND(EXTRACT_ELT(Y, 0), -// FP_ROUND(EXTRACT_ELT(X, 1), -// FP_ROUND(EXTRACT_ELT(Y, 1), ...) -static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, - const ARMSubtarget *ST) { - assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); - if (!ST->hasMVEFloatOps()) - return SDValue(); - - SDLoc dl(BV); - EVT VT = BV.getValueType(); - if (VT != MVT::v8f16) - return SDValue(); - - // We are looking for a buildvector of fptrunc elements, where all the - // elements are interleavingly extracted from two sources. Check the first two - // items are valid enough and extract some info from them (they are checked - // properly in the loop below). 
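The isVREVMask helper added above accepts a shuffle mask when the lanes inside every BlockSize-bit block appear in mirrored order. A minimal standalone sketch of that test in plain C++ (the helper name is invented, the block width is taken from the sizes rather than recovered from M[0], and -1 marks an undefined lane):

#include <cstdio>
#include <vector>

static bool looksLikeVREVMask(const std::vector<int> &M, unsigned EltBits,
                              unsigned BlockBits) {
  if (BlockBits <= EltBits || BlockBits % EltBits != 0)
    return false;
  const unsigned BlockElts = BlockBits / EltBits;
  for (unsigned i = 0; i < M.size(); ++i) {
    if (M[i] < 0)
      continue; // undefined lane: be optimistic, as the real check is
    unsigned Expected = (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts);
    if (static_cast<unsigned>(M[i]) != Expected)
      return false;
  }
  return true;
}

int main() {
  // v8i16 with a 32-bit block size: lanes swap in pairs, i.e. VREV32.16.
  std::vector<int> Mask = {1, 0, 3, 2, 5, 4, 7, 6};
  std::printf("vrev32: %d\n", looksLikeVREVMask(Mask, 16, 32));
}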
- if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND || - BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || - BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0) - return SDValue(); - if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND || - BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || - BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0) - return SDValue(); - SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); - SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0); - if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32) - return SDValue(); - - // Check all the values in the BuildVector line up with our expectations. - for (unsigned i = 1; i < 4; i++) { - auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { - return Trunc.getOpcode() == ISD::FP_ROUND && - Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - Trunc.getOperand(0).getOperand(0) == Op && - Trunc.getOperand(0).getConstantOperandVal(1) == Idx; - }; - if (!Check(BV.getOperand(i * 2 + 0), Op0, i)) - return SDValue(); - if (!Check(BV.getOperand(i * 2 + 1), Op1, i)) - return SDValue(); - } - - SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0, - DAG.getConstant(0, dl, MVT::i32)); - return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1, - DAG.getConstant(1, dl, MVT::i32)); -} - -// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted -// from a single input on alternating lanes. For example: -// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), -// FP_ROUND(EXTRACT_ELT(X, 2), -// FP_ROUND(EXTRACT_ELT(X, 4), ...) -static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, - const ARMSubtarget *ST) { - assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); - if (!ST->hasMVEFloatOps()) - return SDValue(); - - SDLoc dl(BV); - EVT VT = BV.getValueType(); - if (VT != MVT::v4f32) - return SDValue(); - - // We are looking for a buildvector of fptext elements, where all the - // elements are alternating lanes from a single source. For example <0,2,4,6> - // or <1,3,5,7>. Check the first two items are valid enough and extract some - // info from them (they are checked properly in the loop below). - if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND || - BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); - SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); - int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1); - if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1)) - return SDValue(); - - // Check all the values in the BuildVector line up with our expectations. - for (unsigned i = 1; i < 4; i++) { - auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { - return Trunc.getOpcode() == ISD::FP_EXTEND && - Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - Trunc.getOperand(0).getOperand(0) == Op && - Trunc.getOperand(0).getConstantOperandVal(1) == Idx; - }; - if (!Check(BV.getOperand(i), Op0, 2 * i + Offset)) - return SDValue(); - } - - return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0, - DAG.getConstant(Offset, dl, MVT::i32)); -} - // If N is an integer constant that can be moved into a register in one // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. 
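LowerBuildVectorOfFPTrunc above only fires when the build_vector's lanes alternate strictly between lane i of one v4f32 source and lane i of the other. Modelling lane provenance as (source, index) pairs instead of SDValues, the shape being matched is roughly the following (function and variable names are illustrative only):

#include <cstdio>
#include <utility>
#include <vector>

static bool isInterleaveOfTwoSources(
    const std::vector<std::pair<char, unsigned>> &Lanes, char X, char Y) {
  if (Lanes.size() % 2 != 0)
    return false;
  for (unsigned i = 0; i < Lanes.size() / 2; ++i) {
    if (Lanes[2 * i] != std::make_pair(X, i))     // even lane: X[i]
      return false;
    if (Lanes[2 * i + 1] != std::make_pair(Y, i)) // odd lane: Y[i]
      return false;
  }
  return true;
}

int main() {
  std::vector<std::pair<char, unsigned>> Lanes = {
      {'X', 0}, {'Y', 0}, {'X', 1}, {'Y', 1},
      {'X', 2}, {'Y', 2}, {'X', 3}, {'Y', 3}};
  std::printf("vcvtn candidate: %d\n", isInterleaveOfTwoSources(Lanes, 'X', 'Y'));
}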
@@ -7709,10 +7094,7 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, unsigned NumElts = VT.getVectorNumElements(); unsigned BoolMask; unsigned BitsPerBool; - if (NumElts == 2) { - BitsPerBool = 8; - BoolMask = 0xff; - } else if (NumElts == 4) { + if (NumElts == 4) { BitsPerBool = 4; BoolMask = 0xf; } else if (NumElts == 8) { @@ -7728,9 +7110,10 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, // extend that single value SDValue FirstOp = Op.getOperand(0); if (!isa<ConstantSDNode>(FirstOp) && - llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) { - return U.get().isUndef() || U.get() == FirstOp; - })) { + std::all_of(std::next(Op->op_begin()), Op->op_end(), + [&FirstOp](SDUse &U) { + return U.get().isUndef() || U.get() == FirstOp; + })) { SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, DAG.getValueType(MVT::i1)); return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); @@ -7761,79 +7144,6 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, return Base; } -static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *ST) { - if (!ST->hasMVEIntegerOps()) - return SDValue(); - - // We are looking for a buildvector where each element is Op[0] + i*N - EVT VT = Op.getValueType(); - SDValue Op0 = Op.getOperand(0); - unsigned NumElts = VT.getVectorNumElements(); - - // Get the increment value from operand 1 - SDValue Op1 = Op.getOperand(1); - if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 || - !isa<ConstantSDNode>(Op1.getOperand(1))) - return SDValue(); - unsigned N = Op1.getConstantOperandVal(1); - if (N != 1 && N != 2 && N != 4 && N != 8) - return SDValue(); - - // Check that each other operand matches - for (unsigned I = 2; I < NumElts; I++) { - SDValue OpI = Op.getOperand(I); - if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 || - !isa<ConstantSDNode>(OpI.getOperand(1)) || - OpI.getConstantOperandVal(1) != I * N) - return SDValue(); - } - - SDLoc DL(Op); - return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0, - DAG.getConstant(N, DL, MVT::i32)); -} - -// Returns true if the operation N can be treated as qr instruction variant at -// operand Op. -static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) { - switch (N->getOpcode()) { - case ISD::ADD: - case ISD::MUL: - case ISD::SADDSAT: - case ISD::UADDSAT: - return true; - case ISD::SUB: - case ISD::SSUBSAT: - case ISD::USUBSAT: - return N->getOperand(1).getNode() == Op; - case ISD::INTRINSIC_WO_CHAIN: - switch (N->getConstantOperandVal(0)) { - case Intrinsic::arm_mve_add_predicated: - case Intrinsic::arm_mve_mul_predicated: - case Intrinsic::arm_mve_qadd_predicated: - case Intrinsic::arm_mve_vhadd: - case Intrinsic::arm_mve_hadd_predicated: - case Intrinsic::arm_mve_vqdmulh: - case Intrinsic::arm_mve_qdmulh_predicated: - case Intrinsic::arm_mve_vqrdmulh: - case Intrinsic::arm_mve_qrdmulh_predicated: - case Intrinsic::arm_mve_vqdmull: - case Intrinsic::arm_mve_vqdmull_predicated: - return true; - case Intrinsic::arm_mve_sub_predicated: - case Intrinsic::arm_mve_qsub_predicated: - case Intrinsic::arm_mve_vhsub: - case Intrinsic::arm_mve_hsub_predicated: - return N->getOperand(2).getNode() == Op; - default: - return false; - } - default: - return false; - } -} - // If this is a case we can't handle, return null and let the default // expansion code take care of it. 
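The removed LowerBUILD_VECTORToVIDUP looked for a build_vector whose lanes form Op0, Op0+N, Op0+2N, ... with N restricted to 1, 2, 4 or 8, which is exactly the sequence an MVE VIDUP produces. On plain integers the shape test is essentially (sketch only, names invented):

#include <cstdio>
#include <vector>

static bool isIncrementingSequence(const std::vector<int> &Elts, int &Step) {
  if (Elts.size() < 2)
    return false;
  Step = Elts[1] - Elts[0];
  if (Step != 1 && Step != 2 && Step != 4 && Step != 8)
    return false;
  for (size_t i = 2; i < Elts.size(); ++i)
    if (Elts[i] - Elts[0] != static_cast<int>(i) * Step)
      return false;
  return true;
}

int main() {
  std::vector<int> V = {10, 12, 14, 16}; // base 10, step 2: a VIDUP candidate
  int Step = 0;
  std::printf("vidup: %d, step %d\n", isIncrementingSequence(V, Step), Step);
}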
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, @@ -7845,37 +7155,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) return LowerBUILD_VECTOR_i1(Op, DAG, ST); - if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST)) - return R; - APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatUndef.isAllOnes()) + if (SplatUndef.isAllOnesValue()) return DAG.getUNDEF(VT); - // If all the users of this constant splat are qr instruction variants, - // generate a vdup of the constant. - if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize && - (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) && - all_of(BVN->uses(), - [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) { - EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 - : SplatBitSize == 16 ? MVT::v8i16 - : MVT::v16i8; - SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); - SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); - return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); - } - if ((ST->hasNEON() && SplatBitSize <= 64) || - (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) { + (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { // Check if an immediate VMOV works. EVT VmovVT; - SDValue Val = - isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), - SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm); + SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VmovVT, VT.is128BitVector(), + VMOVModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); @@ -7885,8 +7179,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Try an immediate VMVN. uint64_t NegatedImm = (~SplatBits).getZExtValue(); Val = isVMOVModifiedImm( - NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, - VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); + NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VmovVT, VT.is128BitVector(), + ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); @@ -7900,18 +7195,6 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); } } - - // If we are under MVE, generate a VDUP(constant), bitcast to the original - // type. - if (ST->hasMVEIntegerOps() && - (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) { - EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 - : SplatBitSize == 16 ? MVT::v8i16 - : MVT::v16i8; - SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); - SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); - return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); - } } } @@ -8038,19 +7321,12 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (isConstant) return SDValue(); - // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and - // vmovn). Empirical tests suggest this is rarely worth it for vectors of - // length <= 2. - if (NumElts >= 4) - if (SDValue shuffle = ReconstructShuffle(Op, DAG)) + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 
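LowerBUILD_VECTOR asks isVMOVModifiedImm whether a constant splat fits one of the VMOV modified-immediate encodings, then retries with the bitwise inverse for VMVN. One concrete case handled earlier in this file is the 64-bit form in which every byte of the splat must be 0x00 or 0xff and each all-ones byte sets one bit of an 8-bit immediate. A standalone sketch of just that case (an assumed simplification, not the full encoder):

#include <cstdint>
#include <cstdio>
#include <optional>

static std::optional<uint8_t> encodeByteMaskImm(uint64_t SplatBits,
                                                uint64_t SplatUndef) {
  uint8_t Imm = 0;
  for (int Byte = 0; Byte < 8; ++Byte) {
    const uint64_t Mask = 0xffULL << (8 * Byte);
    if (((SplatBits | SplatUndef) & Mask) == Mask)
      Imm |= 1u << Byte;   // byte is all ones (or undefined)
    else if ((SplatBits & Mask) != 0)
      return std::nullopt; // partially set byte: not encodable
  }
  return Imm;
}

int main() {
  if (auto Imm = encodeByteMaskImm(0x00ff00ff00ff00ffULL, 0))
    std::printf("imm = 0x%02x\n", static_cast<unsigned>(*Imm)); // prints 0x55
}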
+ if (NumElts >= 4) { + SDValue shuffle = ReconstructShuffle(Op, DAG); + if (shuffle != SDValue()) return shuffle; - - // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into - // VCVT's - if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget)) - return VCVT; - if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget)) - return VCVT; + } if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { // If we haven't found an efficient lowering, try splitting a 128-bit vector @@ -8058,11 +7334,12 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); EVT ExtVT = VT.getVectorElementType(); EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); - SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2)); + SDValue Lower = + DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); if (Lower.getOpcode() == ISD::BUILD_VECTOR) Lower = LowerBUILD_VECTOR(Lower, DAG, ST); - SDValue Upper = - DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2)); + SDValue Upper = DAG.getBuildVector( + HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); if (Upper.getOpcode() == ISD::BUILD_VECTOR) Upper = LowerBUILD_VECTOR(Upper, DAG, ST); if (Lower && Upper) @@ -8187,19 +7464,17 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); - uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); - uint64_t VTSize = VT.getFixedSizeInBits(); - if (SrcVTSize == VTSize) + if (SrcVT.getSizeInBits() == VT.getSizeInBits()) continue; // This stage of the search produces a source with the same element type as // the original, but with a total width matching the BUILD_VECTOR output. EVT EltVT = SrcVT.getVectorElementType(); - unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); + unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); - if (SrcVTSize < VTSize) { - if (2 * SrcVTSize != VTSize) + if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { + if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) return SDValue(); // We can pad out the smaller vector for free, so if it's part of a // shuffle... @@ -8209,7 +7484,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, continue; } - if (SrcVTSize != 2 * VTSize) + if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) return SDValue(); if (Src.MaxElt - Src.MinElt >= NumSrcElts) { @@ -8252,12 +7527,12 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); - Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec); + Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); Src.WindowBase *= Src.WindowScale; } - // Final check before we try to actually produce a shuffle. + // Final sanity check before we try to actually produce a shuffle. LLVM_DEBUG(for (auto Src : Sources) assert(Src.ShuffleVec.getValueType() == ShuffleVT);); @@ -8277,7 +7552,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. 
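The first ReconstructShuffle pass above normalizes every source to the width of the result: an equal-width source is kept, a source exactly half as wide can be padded out for free, and a source exactly twice as wide is only a candidate for narrowing (and still subject to the MinElt/MaxElt span check); anything else aborts the rewrite. A width-only classification of that decision, with invented names:

#include <cstdio>

enum class SourceFix { Keep, PadToFull, NarrowFromDouble, GiveUp };

// Width classification only; the real routine additionally checks how far
// apart the referenced lanes are before narrowing a double-width source.
static SourceFix classifySourceWidth(unsigned SrcBits, unsigned DstBits) {
  if (SrcBits == DstBits)
    return SourceFix::Keep;
  if (SrcBits < DstBits)
    return 2 * SrcBits == DstBits ? SourceFix::PadToFull : SourceFix::GiveUp;
  return SrcBits == 2 * DstBits ? SourceFix::NarrowFromDouble
                                : SourceFix::GiveUp;
}

int main() {
  std::printf("%d %d %d\n",
              static_cast<int>(classifySourceWidth(64, 128)),   // pad
              static_cast<int>(classifySourceWidth(128, 128)),  // keep
              static_cast<int>(classifySourceWidth(256, 128))); // narrow
}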
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); - int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), + int BitsDefined = std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; @@ -8304,7 +7579,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, ShuffleOps[1], Mask, DAG); if (!Shuffle) return SDValue(); - return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle); + return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } enum ShuffleOpCodes { @@ -8380,17 +7655,11 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { isVTBLMask(M, VT) || isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) return true; - else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && + else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)) return true; else if (Subtarget->hasMVEIntegerOps() && - (isVMOVNMask(M, VT, true, false) || - isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true))) - return true; - else if (Subtarget->hasMVEIntegerOps() && - (isTruncMask(M, VT, false, false) || - isTruncMask(M, VT, false, true) || - isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true))) + (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1))) return true; else return false; @@ -8420,13 +7689,14 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, default: llvm_unreachable("Unknown shuffle opcode!"); case OP_VREV: // VREV divides the vector in half and swaps within the half. - if (VT.getScalarSizeInBits() == 32) + if (VT.getVectorElementType() == MVT::i32 || + VT.getVectorElementType() == MVT::f32) return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); // vrev <4 x i16> -> VREV32 - if (VT.getScalarSizeInBits() == 16) + if (VT.getVectorElementType() == MVT::i16) return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); // vrev <4 x i8> -> VREV16 - assert(VT.getScalarSizeInBits() == 8); + assert(VT.getVectorElementType() == MVT::i8); return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); case OP_VDUP0: case OP_VDUP1: @@ -8464,8 +7734,9 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, SDLoc DL(Op); SmallVector<SDValue, 8> VTBLMask; - for (int I : ShuffleMask) - VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32)); + for (ArrayRef<int>::iterator + I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) + VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); if (V2.getNode()->isUndef()) return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, @@ -8475,29 +7746,25 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); } -static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, + SelectionDAG &DAG) { SDLoc DL(Op); - EVT VT = Op.getValueType(); + SDValue OpLHS = Op.getOperand(0); + EVT VT = OpLHS.getValueType(); - assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && + assert((VT == MVT::v8i16 || VT == MVT::v16i8) && "Expect an v8i16/v16i8 type"); - SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0)); - // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now, + OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); + // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. 
Now, // extract the first 8 bytes into the top double word and the last 8 bytes - // into the bottom double word, through a new vector shuffle that will be - // turned into a VEXT on Neon, or a couple of VMOVDs on MVE. - std::vector<int> NewMask; - for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) - NewMask.push_back(VT.getVectorNumElements() / 2 + i); - for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) - NewMask.push_back(i); - return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask); + // into the bottom double word. The v8i16 case is similar. + unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; + return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, + DAG.getConstant(ExtractNum, DL, MVT::i32)); } static EVT getVectorTyFromPredicateVector(EVT VT) { switch (VT.getSimpleVT().SimpleTy) { - case MVT::v2i1: - return MVT::v2f64; case MVT::v4i1: return MVT::v4i32; case MVT::v8i1: @@ -8554,7 +7821,6 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, "No support for vector shuffle of boolean predicates"); SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); SDLoc dl(Op); if (isReverseMask(ShuffleMask, VT)) { SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); @@ -8572,26 +7838,15 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, // many cases the generated code might be even better than scalar code // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit // fields in a register into 8 other arbitrary 2-bit fields! - SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG); - EVT NewVT = PredAsVector1.getValueType(); - SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT) - : PromoteMVEPredVector(dl, V2, VT, DAG); - assert(PredAsVector2.getValueType() == NewVT && - "Expected identical vector type in expanded i1 shuffle!"); + SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); + EVT NewVT = PredAsVector.getValueType(); // Do the shuffle! - SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1, - PredAsVector2, ShuffleMask); + SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, + DAG.getUNDEF(NewVT), ShuffleMask); // Now return the result of comparing the shuffled vector with zero, - // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 - // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s. - if (VT == MVT::v2i1) { - SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled); - SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, - DAG.getConstant(ARMCC::NE, dl, MVT::i32)); - return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); - } + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } @@ -8649,8 +7904,8 @@ static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, Input = Op->getOperand(1); Elt -= 4; } - SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input); - Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast, + SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input); + Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast, DAG.getConstant(Elt, dl, MVT::i32)); } } @@ -8669,70 +7924,19 @@ static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, Parts[Part] ? 
-1 : ShuffleMask[Part * QuarterSize + i]); SDValue NewShuffle = DAG.getVectorShuffle( VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask); - SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle); + SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle); for (int Part = 0; Part < 4; ++Part) if (!Parts[Part]) - Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, + Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast, DAG.getConstant(Part, dl, MVT::i32)); } // Build a vector out of the various parts and bitcast it back to the original // type. - SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts); + SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts); return DAG.getBitcast(VT, NewVec); } -static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, - ArrayRef<int> ShuffleMask, - SelectionDAG &DAG) { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - EVT VT = Op.getValueType(); - unsigned NumElts = VT.getVectorNumElements(); - - // An One-Off Identity mask is one that is mostly an identity mask from as - // single source but contains a single element out-of-place, either from a - // different vector or from another position in the same vector. As opposed to - // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert - // pair directly. - auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset, - int &OffElement) { - OffElement = -1; - int NonUndef = 0; - for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) { - if (Mask[i] == -1) - continue; - NonUndef++; - if (Mask[i] != i + BaseOffset) { - if (OffElement == -1) - OffElement = i; - else - return false; - } - } - return NonUndef > 2 && OffElement != -1; - }; - int OffElement; - SDValue VInput; - if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement)) - VInput = V1; - else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement)) - VInput = V2; - else - return SDValue(); - - SDLoc dl(Op); - EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16 - ? MVT::i32 - : VT.getScalarType(); - SDValue Elt = DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, dl, SVT, - ShuffleMask[OffElement] < (int)NumElts ? 
V1 : V2, - DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl)); - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt, - DAG.getVectorIdxConstant(OffElement % NumElts, dl)); -} - static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); @@ -8819,15 +8023,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, } } if (ST->hasMVEIntegerOps()) { - if (isVMOVNMask(ShuffleMask, VT, false, false)) + if (isVMOVNMask(ShuffleMask, VT, 0)) return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, DAG.getConstant(0, dl, MVT::i32)); - if (isVMOVNMask(ShuffleMask, VT, true, false)) + if (isVMOVNMask(ShuffleMask, VT, 1)) return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, DAG.getConstant(1, dl, MVT::i32)); - if (isVMOVNMask(ShuffleMask, VT, true, true)) - return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1, - DAG.getConstant(1, dl, MVT::i32)); } // Also check for these shuffles through CONCAT_VECTORS: we canonicalize @@ -8869,29 +8070,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, } } - if (ST->hasMVEIntegerOps() && EltSize <= 32) { - if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG)) - return V; - - for (bool Top : {false, true}) { - for (bool SingleSource : {false, true}) { - if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) { - MVT FromSVT = MVT::getIntegerVT(EltSize * 2); - MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2); - SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1); - SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, - SingleSource ? V1 : V2); - if (Top) { - SDValue Amt = DAG.getConstant(EltSize, dl, FromVT); - Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt); - Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt); - } - return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi); - } - } - } - } - // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. 
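Looking back at LowerVECTOR_SHUFFLEUsingOneOff earlier in this hunk: a "one-off identity" mask is an identity mask (optionally offset by the second operand's base) with exactly one lane out of place and more than two lanes defined, so the whole shuffle collapses into a single extract/insert pair. The same test on a plain mask, with -1 as the undefined marker and an invented function name:

#include <cstdio>
#include <vector>

static int findOneOffLane(const std::vector<int> &Mask, int Base) {
  int OffLane = -1, Defined = 0;
  for (int i = 0, e = static_cast<int>(Mask.size()); i < e; ++i) {
    if (Mask[i] < 0)
      continue;      // undefined lane
    ++Defined;
    if (Mask[i] != i + Base) {
      if (OffLane != -1)
        return -1;   // more than one lane out of place
      OffLane = i;
    }
  }
  return (Defined > 2 && OffLane != -1) ? OffLane : -1;
}

int main() {
  std::vector<int> Mask = {0, 1, 6, 3}; // lane 2 is the odd one out
  std::printf("off lane = %d\n", findOneOffLane(Mask, 0));
}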
unsigned NumElts = VT.getVectorNumElements(); @@ -8946,9 +8124,8 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::BITCAST, dl, VT, Val); } - if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && - isReverseMask(ShuffleMask, VT)) - return LowerReverse_VECTOR_SHUFFLE(Op, DAG); + if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) + return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); if (ST->hasNEON() && VT == MVT::v8i8) if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) @@ -9065,75 +8242,54 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); SDLoc dl(Op); - assert(Op.getValueType().getScalarSizeInBits() == 1 && - "Unexpected custom CONCAT_VECTORS lowering"); - assert(isPowerOf2_32(Op.getNumOperands()) && + EVT VT = Op.getValueType(); + EVT Op1VT = V1.getValueType(); + EVT Op2VT = V2.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + assert(Op1VT == Op2VT && "Operand types don't match!"); + assert(VT.getScalarSizeInBits() == 1 && "Unexpected custom CONCAT_VECTORS lowering"); assert(ST->hasMVEIntegerOps() && "CONCAT_VECTORS lowering only supported for MVE"); - auto ConcatPair = [&](SDValue V1, SDValue V2) { - EVT Op1VT = V1.getValueType(); - EVT Op2VT = V2.getValueType(); - assert(Op1VT == Op2VT && "Operand types don't match!"); - EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext()); - - SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); - SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); - - // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets - // promoted to v8i16, etc. - MVT ElType = - getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); - unsigned NumElts = 2 * Op1VT.getVectorNumElements(); - - // Extract the vector elements from Op1 and Op2 one by one and truncate them - // to be the right size for the destination. For example, if Op1 is v4i1 - // then the promoted vector is v4i32. The result of concatenation gives a - // v8i1, which when promoted is v8i16. That means each i32 element from Op1 - // needs truncating to i16 and inserting in the result. - EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); - SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); - auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { - EVT NewVT = NewV.getValueType(); - EVT ConcatVT = ConVec.getValueType(); - for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, - DAG.getIntPtrConstant(i, dl)); - ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, - DAG.getConstant(j, dl, MVT::i32)); - } - return ConVec; - }; - unsigned j = 0; - ConVec = ExtractInto(NewV1, ConVec, j); - ConVec = ExtractInto(NewV2, ConVec, j); - - // Now return the result of comparing the subvector with zero, which will - // generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 we - // convert to a v4i1 compare to fill in the two halves of the i64 as i32s. 
- if (VT == MVT::v2i1) { - SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, ConVec); - SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, - DAG.getConstant(ARMCC::NE, dl, MVT::i32)); - return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); + + // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets + // promoted to v8i16, etc. + + MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + + // Extract the vector elements from Op1 and Op2 one by one and truncate them + // to be the right size for the destination. For example, if Op1 is v4i1 then + // the promoted vector is v4i32. The result of concatentation gives a v8i1, + // which when promoted is v8i16. That means each i32 element from Op1 needs + // truncating to i16 and inserting in the result. + EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); + SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); + auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { + EVT NewVT = NewV.getValueType(); + EVT ConcatVT = ConVec.getValueType(); + for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, + DAG.getIntPtrConstant(i, dl)); + ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); } - return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, - DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + return ConVec; }; + unsigned j = 0; + ConVec = ExractInto(NewV1, ConVec, j); + ConVec = ExractInto(NewV2, ConVec, j); - // Concat each pair of subvectors and pack into the lower half of the array. - SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end()); - while (ConcatOps.size() > 1) { - for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { - SDValue V1 = ConcatOps[I]; - SDValue V2 = ConcatOps[I + 1]; - ConcatOps[I / 2] = ConcatPair(V1, V2); - } - ConcatOps.resize(ConcatOps.size() / 2); - } - return ConcatOps[0]; + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 
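The removed LowerCONCAT_VECTORS_i1 handled any power-of-two operand count by folding adjacent pairs with ConcatPair until a single vector remained, whereas the restored version assumes exactly two operands. The loop shape, reduced to ordinary containers (string concatenation stands in for ConcatPair):

#include <cstdio>
#include <string>
#include <vector>

template <typename T, typename F>
static T reducePairwise(std::vector<T> Ops, F ConcatPair) {
  while (Ops.size() > 1) {
    for (size_t I = 0, E = Ops.size(); I != E; I += 2)
      Ops[I / 2] = ConcatPair(Ops[I], Ops[I + 1]);
    Ops.resize(Ops.size() / 2);
  }
  return Ops[0];
}

int main() {
  std::vector<std::string> Parts = {"v4i1#0", "v4i1#1", "v4i1#2", "v4i1#3"};
  std::string Out = reducePairwise(Parts, [](const std::string &A,
                                             const std::string &B) {
    return "(" + A + "++" + B + ")";
  });
  std::printf("%s\n", Out.c_str()); // ((v4i1#0++v4i1#1)++(v4i1#2++v4i1#3))
}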
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, @@ -9183,22 +8339,6 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); - if (NumElts == 2) { - EVT SubVT = MVT::v4i32; - SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); - for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) { - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, - DAG.getIntPtrConstant(i, dl)); - SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, - DAG.getConstant(j, dl, MVT::i32)); - SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, - DAG.getConstant(j + 1, dl, MVT::i32)); - } - SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec, - DAG.getConstant(ARMCC::NE, dl, MVT::i32)); - return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); - } - EVT SubVT = MVT::getVectorVT(ElType, NumElts); SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { @@ -9214,116 +8354,6 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } -// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0). -static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - assert(ST->hasMVEIntegerOps() && "Expected MVE!"); - EVT VT = N->getValueType(0); - assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) && - "Expected a vector i1 type!"); - SDValue Op = N->getOperand(0); - EVT FromVT = Op.getValueType(); - SDLoc DL(N); - - SDValue And = - DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT)); - return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT), - DAG.getCondCode(ISD::SETNE)); -} - -static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - EVT ToVT = N->getValueType(0); - if (ToVT.getScalarType() == MVT::i1) - return LowerTruncatei1(N, DAG, Subtarget); - - // MVE does not have a single instruction to perform the truncation of a v4i32 - // into the lower half of a v8i16, in the same way that a NEON vmovn would. - // Most of the instructions in MVE follow the 'Beats' system, where moving - // values from different lanes is usually something that the instructions - // avoid. - // - // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B, - // which take a the top/bottom half of a larger lane and extend it (or do the - // opposite, truncating into the top/bottom lane from a larger lane). Note - // that because of the way we widen lanes, a v4i16 is really a v4i32 using the - // bottom 16bits from each vector lane. This works really well with T/B - // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need - // to move order. - // - // But truncates and sext/zext are always going to be fairly common from llvm. - // We have several options for how to deal with them: - // - Wherever possible combine them into an instruction that makes them - // "free". This includes loads/stores, which can perform the trunc as part - // of the memory operation. Or certain shuffles that can be turned into - // VMOVN/VMOVL. - // - Lane Interleaving to transform blocks surrounded by ext/trunc. 
So - // trunc(mul(sext(a), sext(b))) may become - // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in - // this case can use VMULL). This is performed in the - // MVELaneInterleavingPass. - // - Otherwise we have an option. By default we would expand the - // zext/sext/trunc into a series of lane extract/inserts going via GPR - // registers. One for each vector lane in the vector. This can obviously be - // very expensive. - // - The other option is to use the fact that loads/store can extend/truncate - // to turn a trunc into two truncating stack stores and a stack reload. This - // becomes 3 back-to-back memory operations, but at least that is less than - // all the insert/extracts. - // - // In order to do the last, we convert certain trunc's into MVETRUNC, which - // are either optimized where they can be, or eventually lowered into stack - // stores/loads. This prevents us from splitting a v8i16 trunc into two stores - // two early, where other instructions would be better, and stops us from - // having to reconstruct multiple buildvector shuffles into loads/stores. - if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8) - return SDValue(); - EVT FromVT = N->getOperand(0).getValueType(); - if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16) - return SDValue(); - - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); - SDLoc DL(N); - return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi); -} - -static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC. - - EVT ToVT = N->getValueType(0); - if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16) - return SDValue(); - SDValue Op = N->getOperand(0); - EVT FromVT = Op.getValueType(); - if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8) - return SDValue(); - - SDLoc DL(N); - EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); - if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) - ExtVT = MVT::v8i16; - - unsigned Opcode = - N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT; - SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op); - SDValue Ext1 = Ext.getValue(1); - - if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) { - Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext); - Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1); - } - - return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1); -} - /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each /// element has been zero/sign-extended, depending on the isSigned parameter, /// from an integer type half its size. @@ -9349,7 +8379,7 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) return true; } else { - if (Hi0->isZero() && Hi1->isZero()) + if (Hi0->isNullValue() && Hi1->isNullValue()) return true; } return false; @@ -9388,11 +8418,10 @@ static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { return false; } -/// isZeroExtended - Check if a node is a vector value that is zero-extended (or -/// any-extended) or a constant BUILD_VECTOR with zero-extended elements. +/// isZeroExtended - Check if a node is a vector value that is zero-extended +/// or a constant BUILD_VECTOR with zero-extended elements. 
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND || - ISD::isZEXTLoad(N)) + if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) return true; if (isExtendedBUILD_VECTOR(N, DAG, false)) return true; @@ -9447,27 +8476,26 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { // The load already has the right type. if (ExtendedTy == LD->getMemoryVT()) return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), - LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(), - LD->getMemOperand()->getFlags()); + LD->getBasePtr(), LD->getPointerInfo(), + LD->getAlignment(), LD->getMemOperand()->getFlags()); // We need to create a zextload/sextload. We cannot just create a load // followed by a zext/zext node because LowerMUL is also run during normal // operation legalization where we can't create illegal types. return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), - LD->getMemoryVT(), LD->getAlign(), + LD->getMemoryVT(), LD->getAlignment(), LD->getMemOperand()->getFlags()); } /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, -/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return -/// the unextended value. The unextended vector should be 64 bits so that it can +/// extending load, or BUILD_VECTOR with extended elements, return the +/// unextended value. The unextended vector should be 64 bits so that it can /// be used as an operand to a VMULL instruction. If the original vector size /// before extension is less than 64 bits we add a an extension to resize /// the vector to 64 bits. static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::SIGN_EXTEND || - N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND) + if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), @@ -9864,7 +8892,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { if (ShouldUseSRet) { // Create stack object for sret. const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); + const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); @@ -9964,7 +8992,7 @@ ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, if (N->getOpcode() != ISD::SDIV) return SDValue(); - const auto &ST = DAG.getSubtarget<ARMSubtarget>(); + const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); const bool MinSize = ST.hasMinSize(); const bool HasDivide = ST.isThumb() ? 
ST.hasDivideInThumbMode() : ST.hasDivideInARMMode(); @@ -10039,136 +9067,69 @@ void ARMTargetLowering::ExpandDIV_Windows( DAG.getConstant(32, dl, TLI.getPointerTy(DL))); Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper)); + Results.push_back(Lower); + Results.push_back(Upper); } static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); EVT MemVT = LD->getMemoryVT(); - assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || - MemVT == MVT::v16i1) && + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && "Expected a predicate type!"); assert(MemVT == Op.getValueType()); assert(LD->getExtensionType() == ISD::NON_EXTLOAD && "Expected a non-extending load"); assert(LD->isUnindexed() && "Expected a unindexed load"); - // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit + // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We - // need to make sure that 8/4/2 bits are actually loaded into the correct + // need to make sure that 8/4 bits are actually loaded into the correct // place, which means loading the value and then shuffling the values into // the bottom bits of the predicate. // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect // for BE). - // Speaking of BE, apparently the rest of llvm will assume a reverse order to - // a natural VMSR(load), so needs to be reversed. SDLoc dl(Op); SDValue Load = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(), EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), LD->getMemOperand()); - SDValue Val = Load; - if (DAG.getDataLayout().isBigEndian()) - Val = DAG.getNode(ISD::SRL, dl, MVT::i32, - DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load), - DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32)); - SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val); + SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load); if (MemVT != MVT::v16i1) Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred, DAG.getConstant(0, dl, MVT::i32)); return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); } -void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) const { - LoadSDNode *LD = cast<LoadSDNode>(N); - EVT MemVT = LD->getMemoryVT(); - assert(LD->isUnindexed() && "Loads should be unindexed at this point."); - - if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && - !Subtarget->isThumb1Only() && LD->isVolatile()) { - SDLoc dl(N); - SDValue Result = DAG.getMemIntrinsicNode( - ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}), - {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand()); - SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1); - SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 
1 : 0); - SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); - Results.append({Pair, Result.getValue(2)}); - } -} - static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); EVT MemVT = ST->getMemoryVT(); - assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || - MemVT == MVT::v16i1) && + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && "Expected a predicate type!"); assert(MemVT == ST->getValue().getValueType()); assert(!ST->isTruncatingStore() && "Expected a non-extending store"); assert(ST->isUnindexed() && "Expected a unindexed store"); - // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with - // top bits unset and a scalar store. + // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits + // unset and a scalar store. SDLoc dl(Op); SDValue Build = ST->getValue(); if (MemVT != MVT::v16i1) { SmallVector<SDValue, 16> Ops; - for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) { - unsigned Elt = DAG.getDataLayout().isBigEndian() - ? MemVT.getVectorNumElements() - I - 1 - : I; + for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, - DAG.getConstant(Elt, dl, MVT::i32))); - } + DAG.getConstant(I, dl, MVT::i32))); for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) Ops.push_back(DAG.getUNDEF(MVT::i32)); Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); } SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); - if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian()) - GRP = DAG.getNode(ISD::SRL, dl, MVT::i32, - DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP), - DAG.getConstant(16, dl, MVT::i32)); return DAG.getTruncStore( ST->getChain(), dl, GRP, ST->getBasePtr(), EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), ST->getMemOperand()); } -static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); - EVT MemVT = ST->getMemoryVT(); - assert(ST->isUnindexed() && "Stores should be unindexed at this point."); - - if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && - !Subtarget->isThumb1Only() && ST->isVolatile()) { - SDNode *N = Op.getNode(); - SDLoc dl(N); - - SDValue Lo = DAG.getNode( - ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), - DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl, - MVT::i32)); - SDValue Hi = DAG.getNode( - ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), - DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 
1 : 0, dl, - MVT::i32)); - - return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other), - {ST->getChain(), Lo, Hi, ST->getBasePtr()}, - MemVT, ST->getMemOperand()); - } else if (Subtarget->hasMVEIntegerOps() && - ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || - MemVT == MVT::v16i1))) { - return LowerPredicateStore(Op, DAG); - } - - return SDValue(); -} - static bool isZeroVector(SDValue N) { return (ISD::isBuildVectorAllZeros(N.getNode()) || (N->getOpcode() == ARMISD::VMOVIMM && @@ -10194,89 +9155,15 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad()); SDValue Combo = NewLoad; - bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST || - PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) && - isZeroVector(PassThru->getOperand(0)); - if (!PassThru.isUndef() && !PassThruIsCastZero) + if (!PassThru.isUndef() && + (PassThru.getOpcode() != ISD::BITCAST || + !isZeroVector(PassThru->getOperand(0)))) Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); } -static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *ST) { - if (!ST->hasMVEIntegerOps()) - return SDValue(); - - SDLoc dl(Op); - unsigned BaseOpcode = 0; - switch (Op->getOpcode()) { - default: llvm_unreachable("Expected VECREDUCE opcode"); - case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break; - case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break; - case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break; - case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break; - case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break; - case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break; - case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; - case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; - } - - SDValue Op0 = Op->getOperand(0); - EVT VT = Op0.getValueType(); - EVT EltVT = VT.getVectorElementType(); - unsigned NumElts = VT.getVectorNumElements(); - unsigned NumActiveLanes = NumElts; - - assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 || - NumActiveLanes == 2) && - "Only expected a power 2 vector size"); - - // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements - // allows us to easily extract vector elements from the lanes. - while (NumActiveLanes > 4) { - unsigned RevOpcode = NumActiveLanes == 16 ? 
ARMISD::VREV16 : ARMISD::VREV32; - SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0); - Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev); - NumActiveLanes /= 2; - } - - SDValue Res; - if (NumActiveLanes == 4) { - // The remaining 4 elements are summed sequentially - SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(0 * NumElts / 4, dl, MVT::i32)); - SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(1 * NumElts / 4, dl, MVT::i32)); - SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(2 * NumElts / 4, dl, MVT::i32)); - SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(3 * NumElts / 4, dl, MVT::i32)); - SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); - SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags()); - Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags()); - } else { - SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(0, dl, MVT::i32)); - SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(1, dl, MVT::i32)); - Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); - } - - // Result type may be wider than element type. - if (EltVT != Op->getValueType(0)) - Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res); - return Res; -} - -static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *ST) { - if (!ST->hasMVEFloatOps()) - return SDValue(); - return LowerVecReduce(Op, DAG, ST); -} - static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { - if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering())) + if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or // equivalent available. return SDValue(); @@ -10344,13 +9231,12 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N, bool isBigEndian = DAG.getDataLayout().isBigEndian(); - SDValue Lo = + Results.push_back( DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, - SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); - SDValue Hi = + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); + Results.push_back( DAG.getTargetExtractSubreg(isBigEndian ? 
ARM::gsub_0 : ARM::gsub_1, - SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi)); + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); Results.push_back(SDValue(CmpSwap, 2)); } @@ -10399,15 +9285,6 @@ SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues({Result, Chain}, dl); } -SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const { - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - - EVT VT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); - int FI = MFI.CreateFixedObject(4, 0, false); - return DAG.getFrameIndex(FI, VT); -} - SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { @@ -10431,8 +9308,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_FP_TO_UINT: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); - case ISD::FP_TO_SINT_SAT: - case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); @@ -10463,11 +9338,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); - case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget); - case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget); - case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); - case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) @@ -10487,25 +9358,13 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerUnsignedALUO(Op, DAG); case ISD::SADDSAT: case ISD::SSUBSAT: - case ISD::UADDSAT: - case ISD::USUBSAT: - return LowerADDSUBSAT(Op, DAG, Subtarget); + return LowerSADDSUBSAT(Op, DAG, Subtarget); case ISD::LOAD: return LowerPredicateLoad(Op, DAG); case ISD::STORE: - return LowerSTORE(Op, DAG, Subtarget); + return LowerPredicateStore(Op, DAG); case ISD::MLOAD: return LowerMLOAD(Op, DAG); - case ISD::VECREDUCE_MUL: - case ISD::VECREDUCE_AND: - case ISD::VECREDUCE_OR: - case ISD::VECREDUCE_XOR: - return LowerVecReduce(Op, DAG, Subtarget); - case ISD::VECREDUCE_FADD: - case ISD::VECREDUCE_FMUL: - case ISD::VECREDUCE_FMIN: - case ISD::VECREDUCE_FMAX: - return LowerVecReduceF(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); @@ -10521,8 +9380,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG); - case ISD::SPONENTRY: - return LowerSPONENTRY(Op, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); } } @@ -10554,8 +9411,8 @@ static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, DAG.getVTList(MVT::i32, 
MVT::i32), N->getOperand(1), N->getOperand(2), Lo, Hi); - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, - LongMul.getValue(0), LongMul.getValue(1))); + Results.push_back(LongMul.getValue(0)); + Results.push_back(LongMul.getValue(1)); } /// ReplaceNodeResults - Replace the results of node with an illegal result @@ -10591,9 +9448,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::SADDSAT: case ISD::SSUBSAT: - case ISD::UADDSAT: - case ISD::USUBSAT: - Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget); + Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget); break; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); @@ -10608,20 +9463,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::INTRINSIC_WO_CHAIN: return ReplaceLongIntrinsic(N, Results, DAG); - case ISD::LOAD: - LowerLOAD(N, Results, DAG); - break; - case ISD::TRUNCATE: - Res = LowerTruncate(N, DAG, Subtarget); - break; - case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: - Res = LowerVectorExtend(N, DAG, Subtarget); - break; - case ISD::FP_TO_SINT_SAT: - case ISD::FP_TO_UINT_SAT: - Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget); - break; + case ISD::ABS: + lowerABS(N, Results, DAG); + return ; + } if (Res.getNode()) Results.push_back(Res); @@ -10654,7 +9499,7 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); - unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4)); + unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; @@ -10662,11 +9507,11 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), - MachineMemOperand::MOLoad, 4, Align(4)); + MachineMemOperand::MOLoad, 4, 4); MachineMemOperand *FIMMOSt = MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), - MachineMemOperand::MOStore, 4, Align(4)); + MachineMemOperand::MOStore, 4, 4); // Load the address of the dispatch MBB into the jump buffer. if (isThumb2) { @@ -10777,23 +9622,25 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // associated with. DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; unsigned MaxCSNum = 0; - for (MachineBasicBlock &BB : *MF) { - if (!BB.isEHPad()) - continue; + for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; + ++BB) { + if (!BB->isEHPad()) continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing // pad. 
- for (MachineInstr &II : BB) { - if (!II.isEHLabel()) - continue; + for (MachineBasicBlock::iterator + II = BB->begin(), IE = BB->end(); II != IE; ++II) { + if (!II->isEHLabel()) continue; - MCSymbol *Sym = II.getOperand(0).getMCSymbol(); + MCSymbol *Sym = II->getOperand(0).getMCSymbol(); if (!MF->hasCallSiteLandingPad(Sym)) continue; SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); - for (unsigned Idx : CallSiteIdxs) { - CallSiteNumToLPad[Idx].push_back(&BB); - MaxCSNum = std::max(MaxCSNum, Idx); + for (SmallVectorImpl<unsigned>::iterator + CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); + CSI != CSE; ++CSI) { + CallSiteNumToLPad[*CSI].push_back(&*BB); + MaxCSNum = std::max(MaxCSNum, *CSI); } break; } @@ -10805,9 +9652,10 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned I = 1; I <= MaxCSNum; ++I) { SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; - for (MachineBasicBlock *MBB : MBBList) { - LPadList.push_back(MBB); - InvokeBBs.insert(MBB->pred_begin(), MBB->pred_end()); + for (SmallVectorImpl<MachineBasicBlock*>::iterator + II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { + LPadList.push_back(*II); + InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); } } @@ -10849,7 +9697,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), - MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4)); + MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); MachineInstrBuilder MIB; MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); @@ -10940,8 +9788,10 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); + if (Align == 0) + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) @@ -10978,9 +9828,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(NewVReg3) .add(predOps(ARMCC::AL)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), - MachineMemOperand::MOLoad, 4, Align(4)); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) @@ -11040,8 +9889,10 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. 
- Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); + if (Align == 0) + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) @@ -11071,9 +9922,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), - MachineMemOperand::MOLoad, 4, Align(4)); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) .addReg(NewVReg3, RegState::Kill) @@ -11096,7 +9946,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Add the jump table entries as successors to the MBB. SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; - for (MachineBasicBlock *CurMBB : LPadList) { + for (std::vector<MachineBasicBlock*>::iterator + I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { + MachineBasicBlock *CurMBB = *I; if (SeenMBBs.insert(CurMBB).second) DispContBB->addSuccessor(CurMBB); } @@ -11108,7 +9960,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. - SmallVector<MachineBasicBlock*, 4> Successors(BB->successors()); + SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), + BB->succ_end()); while (!Successors.empty()) { MachineBasicBlock *SMBB = Successors.pop_back_val(); if (SMBB->isEHPad()) { @@ -11158,8 +10011,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. - for (MachineBasicBlock *MBBLPad : MBBLPads) - MBBLPad->setIsEHPad(false); + for (SmallVectorImpl<MachineBasicBlock*>::iterator + I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) + (*I)->setIsEHPad(false); // The instruction is gone now. MI.eraseFromParent(); @@ -11167,9 +10021,10 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, static MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { - for (MachineBasicBlock *S : MBB->successors()) - if (S != Succ) - return S; + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) + if (*I != Succ) + return *I; llvm_unreachable("Expecting a BB with two successors!"); } @@ -11307,7 +10162,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, Register dest = MI.getOperand(0).getReg(); Register src = MI.getOperand(1).getReg(); unsigned SizeVal = MI.getOperand(2).getImm(); - unsigned Alignment = MI.getOperand(3).getImm(); + unsigned Align = MI.getOperand(3).getImm(); DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); @@ -11320,17 +10175,17 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, bool IsThumb2 = Subtarget->isThumb2(); bool IsThumb = Subtarget->isThumb(); - if (Alignment & 1) { + if (Align & 1) { UnitSize = 1; - } else if (Alignment & 2) { + } else if (Align & 2) { UnitSize = 2; } else { // Check whether we can use NEON instructions. 
if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && Subtarget->hasNEON()) { - if ((Alignment % 16 == 0) && SizeVal >= 16) + if ((Align % 16 == 0) && SizeVal >= 16) UnitSize = 16; - else if ((Alignment % 8 == 0) && SizeVal >= 8) + else if ((Align % 8 == 0) && SizeVal >= 8) UnitSize = 8; } // Can't use NEON instructions. @@ -11436,11 +10291,13 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. - Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); + if (Align == 0) + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), - MachineMemOperand::MOLoad, 4, Align(4)); + MachineMemOperand::MOLoad, 4, 4); if (IsThumb) BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) @@ -11590,7 +10447,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) .addExternalSymbol("__chkstk"); - BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent()))) + BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) .add(predOps(ARMCC::AL)) .addReg(Reg, RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Kill) @@ -11667,9 +10524,13 @@ static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, // If we hit the end of the block, check whether CPSR is live into a // successor. if (miI == BB->end()) { - for (MachineBasicBlock *Succ : BB->successors()) - if (Succ->isLiveIn(ARM::CPSR)) + for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), + sEnd = BB->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock* succ = *sItr; + if (succ->isLiveIn(ARM::CPSR)) return false; + } } // We found a def, or hit the end of the basic block and CPSR wasn't live @@ -11678,148 +10539,6 @@ static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, return true; } -/// Adds logic in loop entry MBB to calculate loop iteration count and adds -/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop -static Register genTPEntry(MachineBasicBlock *TpEntry, - MachineBasicBlock *TpLoopBody, - MachineBasicBlock *TpExit, Register OpSizeReg, - const TargetInstrInfo *TII, DebugLoc Dl, - MachineRegisterInfo &MRI) { - // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4. - Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg) - .addUse(OpSizeReg) - .addImm(15) - .add(predOps(ARMCC::AL)) - .addReg(0); - - Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg) - .addUse(AddDestReg, RegState::Kill) - .addImm(4) - .add(predOps(ARMCC::AL)) - .addReg(0); - - Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); - BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg) - .addUse(LsrDestReg, RegState::Kill); - - BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart)) - .addUse(TotalIterationsReg) - .addMBB(TpExit); - - BuildMI(TpEntry, Dl, TII->get(ARM::t2B)) - .addMBB(TpLoopBody) - .add(predOps(ARMCC::AL)); - - return TotalIterationsReg; -} - -/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and -/// t2DoLoopEnd. 
These are used by later passes to generate tail predicated -/// loops. -static void genTPLoopBody(MachineBasicBlock *TpLoopBody, - MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, - const TargetInstrInfo *TII, DebugLoc Dl, - MachineRegisterInfo &MRI, Register OpSrcReg, - Register OpDestReg, Register ElementCountReg, - Register TotalIterationsReg, bool IsMemcpy) { - // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest - // array, loop iteration counter, predication counter. - - Register SrcPhiReg, CurrSrcReg; - if (IsMemcpy) { - // Current position in the src array - SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg) - .addUse(OpSrcReg) - .addMBB(TpEntry) - .addUse(CurrSrcReg) - .addMBB(TpLoopBody); - } - - // Current position in the dest array - Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg) - .addUse(OpDestReg) - .addMBB(TpEntry) - .addUse(CurrDestReg) - .addMBB(TpLoopBody); - - // Current loop counter - Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); - Register RemainingLoopIterationsReg = - MRI.createVirtualRegister(&ARM::GPRlrRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg) - .addUse(TotalIterationsReg) - .addMBB(TpEntry) - .addUse(RemainingLoopIterationsReg) - .addMBB(TpLoopBody); - - // Predication counter - Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg) - .addUse(ElementCountReg) - .addMBB(TpEntry) - .addUse(RemainingElementsReg) - .addMBB(TpLoopBody); - - // Pass predication counter to VCTP - Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg) - .addUse(PredCounterPhiReg) - .addImm(ARMVCC::None) - .addReg(0) - .addReg(0); - - BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg) - .addUse(PredCounterPhiReg) - .addImm(16) - .add(predOps(ARMCC::AL)) - .addReg(0); - - // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR - Register SrcValueReg; - if (IsMemcpy) { - SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post)) - .addDef(CurrSrcReg) - .addDef(SrcValueReg) - .addReg(SrcPhiReg) - .addImm(16) - .addImm(ARMVCC::Then) - .addUse(VccrReg) - .addReg(0); - } else - SrcValueReg = OpSrcReg; - - BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post)) - .addDef(CurrDestReg) - .addUse(SrcValueReg) - .addReg(DestPhiReg) - .addImm(16) - .addImm(ARMVCC::Then) - .addUse(VccrReg) - .addReg(0); - - // Add the pseudoInstrs for decrementing the loop counter and marking the - // end:t2DoLoopDec and t2DoLoopEnd - BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg) - .addUse(LoopCounterPhiReg) - .addImm(1); - - BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd)) - .addUse(RemainingLoopIterationsReg) - .addMBB(TpLoopBody); - - BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B)) - .addMBB(TpExit) - .add(predOps(ARMCC::AL)); -} - MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -11846,98 +10565,6 @@ 
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; } - case ARM::MVE_MEMCPYLOOPINST: - case ARM::MVE_MEMSETLOOPINST: { - - // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo - // into a Tail Predicated (TP) Loop. It adds the instructions to calculate - // the iteration count =ceil(size_in_bytes/16)) in the TP entry block and - // adds the relevant instructions in the TP loop Body for generation of a - // WLSTP loop. - - // Below is relevant portion of the CFG after the transformation. - // The Machine Basic Blocks are shown along with branch conditions (in - // brackets). Note that TP entry/exit MBBs depict the entry/exit of this - // portion of the CFG and may not necessarily be the entry/exit of the - // function. - - // (Relevant) CFG after transformation: - // TP entry MBB - // | - // |-----------------| - // (n <= 0) (n > 0) - // | | - // | TP loop Body MBB<--| - // | | | - // \ |___________| - // \ / - // TP exit MBB - - MachineFunction *MF = BB->getParent(); - MachineFunctionProperties &Properties = MF->getProperties(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - Register OpDestReg = MI.getOperand(0).getReg(); - Register OpSrcReg = MI.getOperand(1).getReg(); - Register OpSizeReg = MI.getOperand(2).getReg(); - - // Allocate the required MBBs and add to parent function. - MachineBasicBlock *TpEntry = BB; - MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock(); - MachineBasicBlock *TpExit; - - MF->push_back(TpLoopBody); - - // If any instructions are present in the current block after - // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and - // move the instructions into the newly created exit block. If there are no - // instructions add an explicit branch to the FallThrough block and then - // split. - // - // The split is required for two reasons: - // 1) A terminator(t2WhileLoopStart) will be placed at that site. - // 2) Since a TPLoopBody will be added later, any phis in successive blocks - // need to be updated. splitAt() already handles this. - TpExit = BB->splitAt(MI, false); - if (TpExit == BB) { - assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the " - "block containing memcpy/memset Pseudo"); - TpExit = BB->getFallThrough(); - BuildMI(BB, dl, TII->get(ARM::t2B)) - .addMBB(TpExit) - .add(predOps(ARMCC::AL)); - TpExit = BB->splitAt(MI, false); - } - - // Add logic for iteration count - Register TotalIterationsReg = - genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI); - - // Add the vectorized (and predicated) loads/store instructions - bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST; - genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg, - OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy); - - // Required to avoid conflict with the MachineVerifier during testing. - Properties.reset(MachineFunctionProperties::Property::NoPHIs); - - // Connect the blocks - TpEntry->addSuccessor(TpLoopBody); - TpLoopBody->addSuccessor(TpLoopBody); - TpLoopBody->addSuccessor(TpExit); - - // Reorder for a more natural layout - TpLoopBody->moveAfter(TpEntry); - TpExit->moveAfter(TpLoopBody); - - // Finally, remove the memcpy Psuedo Instruction - MI.eraseFromParent(); - - // Return the exit block as it may contain other instructions requiring a - // custom inserter - return TpExit; - } - // The Thumb2 pre-indexed stores have the same MI operands, they just // define them differently in the .td files from the isel patterns, so // they need pseudos. 
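// The expansion above turns the memcpy/memset pseudo into a WLSTP-style tail
// predicated loop; the arithmetic it sets up is ceil(n/16) iterations with the
// tail lanes predicated away. A minimal scalar sketch of that shape, in plain
// C++ rather than the MachineIR built by genTPEntry/genTPLoopBody (all names
// here are illustrative only):
#include <algorithm>
#include <cstddef>
#include <cstdint>

static void tailPredicatedCopy(uint8_t *Dst, const uint8_t *Src, size_t N) {
  size_t Iterations = (N + 15) >> 4;                 // ceil(N / 16)
  size_t Remaining = N;                              // predication counter
  for (size_t I = 0; I < Iterations; ++I) {
    size_t Lanes = std::min<size_t>(Remaining, 16);  // active lanes (the VCTP)
    for (size_t L = 0; L < Lanes; ++L)               // predicated VLDRB/VSTRB
      Dst[I * 16 + L] = Src[I * 16 + L];
    Remaining -= Lanes;                              // counter decremented per pass
  }
}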
@@ -11985,8 +10612,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); - for (const MachineOperand &MO : MI.operands()) - MIB.add(MO); + for (unsigned i = 0; i < MI.getNumOperands(); ++i) + MIB.add(MI.getOperand(i)); MI.eraseFromParent(); return BB; } @@ -12266,7 +10893,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if (Subtarget->isThumb1Only()) { for (unsigned c = MCID->getNumOperands() - 4; c--;) { MI.addOperand(MI.getOperand(1)); - MI.removeOperand(1); + MI.RemoveOperand(1); } // Restore the ties @@ -12289,7 +10916,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // Any ARM instruction that sets the 's' bit should specify an optional // "cc_out" operand in the last operand position. - if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) { + if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { assert(!NewOpc && "Optional cc_out operand required"); return; } @@ -12304,7 +10931,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, definesCPSR = true; if (MO.isDead()) deadCPSR = true; - MI.removeOperand(i); + MI.RemoveOperand(i); break; } } @@ -12375,7 +11002,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, // (zext cc) can never be the all ones value. if (AllOnes) return false; - [[fallthrough]]; + LLVM_FALLTHROUGH; case ISD::SIGN_EXTEND: { SDLoc dl(N); EVT VT = N->getValueType(0); @@ -12391,7 +11018,8 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, // When looking for a 0 constant, N can be zext or sext. OtherOp = DAG.getConstant(1, dl, VT); else - OtherOp = DAG.getAllOnesConstant(dl, VT); + OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, + VT); return true; } } @@ -12983,7 +11611,7 @@ static SDValue PerformAddcSubcCombine(SDNode *N, const ARMSubtarget *Subtarget) { SelectionDAG &DAG(DCI.DAG); - if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) { + if (N->getOpcode() == ARMISD::SUBC) { // (SUBC (ADDE 0, 0, C), 1) -> C SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -13039,333 +11667,20 @@ static SDValue PerformAddeSubeCombine(SDNode *N, return SDValue(); } -static SDValue PerformSELECTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - SDLoc dl(N); - SDValue SetCC; - SDValue LHS; - SDValue RHS; - ISD::CondCode CC; - SDValue TrueVal; - SDValue FalseVal; - - if (N->getOpcode() == ISD::SELECT && - N->getOperand(0)->getOpcode() == ISD::SETCC) { - SetCC = N->getOperand(0); - LHS = SetCC->getOperand(0); - RHS = SetCC->getOperand(1); - CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); - TrueVal = N->getOperand(1); - FalseVal = N->getOperand(2); - } else if (N->getOpcode() == ISD::SELECT_CC) { - LHS = N->getOperand(0); - RHS = N->getOperand(1); - CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - TrueVal = N->getOperand(2); - FalseVal = N->getOperand(3); - } else { - return SDValue(); - } - - unsigned int Opcode = 0; - if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN || - FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) && - (CC == ISD::SETULT || CC == ISD::SETUGT)) { - Opcode = ARMISD::VMINVu; - if (CC == ISD::SETUGT) - std::swap(TrueVal, FalseVal); - } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN || - FalseVal->getOpcode() == 
ISD::VECREDUCE_SMIN) && - (CC == ISD::SETLT || CC == ISD::SETGT)) { - Opcode = ARMISD::VMINVs; - if (CC == ISD::SETGT) - std::swap(TrueVal, FalseVal); - } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX || - FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) && - (CC == ISD::SETUGT || CC == ISD::SETULT)) { - Opcode = ARMISD::VMAXVu; - if (CC == ISD::SETULT) - std::swap(TrueVal, FalseVal); - } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX || - FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) && - (CC == ISD::SETGT || CC == ISD::SETLT)) { - Opcode = ARMISD::VMAXVs; - if (CC == ISD::SETLT) - std::swap(TrueVal, FalseVal); - } else - return SDValue(); - - // Normalise to the right hand side being the vector reduction - switch (TrueVal->getOpcode()) { - case ISD::VECREDUCE_UMIN: - case ISD::VECREDUCE_SMIN: - case ISD::VECREDUCE_UMAX: - case ISD::VECREDUCE_SMAX: - std::swap(LHS, RHS); - std::swap(TrueVal, FalseVal); - break; - } - - EVT VectorType = FalseVal->getOperand(0).getValueType(); - - if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 && - VectorType != MVT::v4i32) - return SDValue(); - - EVT VectorScalarType = VectorType.getVectorElementType(); - - // The values being selected must also be the ones being compared - if (TrueVal != LHS || FalseVal != RHS) - return SDValue(); - - EVT LeftType = LHS->getValueType(0); - EVT RightType = RHS->getValueType(0); - - // The types must match the reduced type too - if (LeftType != VectorScalarType || RightType != VectorScalarType) - return SDValue(); - - // Legalise the scalar to an i32 - if (VectorScalarType != MVT::i32) - LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); - - // Generate the reduction as an i32 for legalisation purposes - auto Reduction = - DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0)); - - // The result isn't actually an i32 so truncate it back to its original type - if (VectorScalarType != MVT::i32) - Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction); - - return Reduction; -} - -// A special combine for the vqdmulh family of instructions. This is one of the -// potential set of patterns that could patch this instruction. The base pattern -// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))). -// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))), -// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as -// the max is unnecessary. -static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - SDValue Shft; - ConstantSDNode *Clamp; - - if (!VT.isVector() || VT.getScalarSizeInBits() > 64) - return SDValue(); - - if (N->getOpcode() == ISD::SMIN) { - Shft = N->getOperand(0); - Clamp = isConstOrConstSplat(N->getOperand(1)); - } else if (N->getOpcode() == ISD::VSELECT) { - // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin. 
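// The clamp/shift pattern described above is, per 16-bit lane, the saturating
// doubling multiply-high value: min((x*y) >> 15, 32767), which agrees with
// (2*x*y) >> 16 everywhere except x == y == INT16_MIN, where the clamp keeps
// the result at 0x7FFF. A scalar sketch of the arithmetic only, not of the DAG
// matching (vqdmulh_i16 is an illustrative name; arithmetic >> assumed for
// negative products):
#include <algorithm>
#include <cstdint>

static int16_t vqdmulh_i16(int16_t x, int16_t y) {
  int32_t Prod = int32_t(x) * int32_t(y);              // sext + mul
  int32_t Shifted = Prod >> 15;                        // ashr by 15
  return int16_t(std::min<int32_t>(Shifted, 32767));   // smin with (1 << 15) - 1
}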
- SDValue Cmp = N->getOperand(0); - if (Cmp.getOpcode() != ISD::SETCC || - cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT || - Cmp.getOperand(0) != N->getOperand(1) || - Cmp.getOperand(1) != N->getOperand(2)) - return SDValue(); - Shft = N->getOperand(1); - Clamp = isConstOrConstSplat(N->getOperand(2)); - } else - return SDValue(); - - if (!Clamp) - return SDValue(); - - MVT ScalarType; - int ShftAmt = 0; - switch (Clamp->getSExtValue()) { - case (1 << 7) - 1: - ScalarType = MVT::i8; - ShftAmt = 7; - break; - case (1 << 15) - 1: - ScalarType = MVT::i16; - ShftAmt = 15; - break; - case (1ULL << 31) - 1: - ScalarType = MVT::i32; - ShftAmt = 31; - break; - default: - return SDValue(); - } - - if (Shft.getOpcode() != ISD::SRA) - return SDValue(); - ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1)); - if (!N1 || N1->getSExtValue() != ShftAmt) - return SDValue(); - - SDValue Mul = Shft.getOperand(0); - if (Mul.getOpcode() != ISD::MUL) - return SDValue(); - - SDValue Ext0 = Mul.getOperand(0); - SDValue Ext1 = Mul.getOperand(1); - if (Ext0.getOpcode() != ISD::SIGN_EXTEND || - Ext1.getOpcode() != ISD::SIGN_EXTEND) - return SDValue(); - EVT VecVT = Ext0.getOperand(0).getValueType(); - if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1) - return SDValue(); - if (Ext1.getOperand(0).getValueType() != VecVT || - VecVT.getScalarType() != ScalarType || - VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2) - return SDValue(); - - SDLoc DL(Mul); - unsigned LegalLanes = 128 / (ShftAmt + 1); - EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes); - // For types smaller than legal vectors extend to be legal and only use needed - // lanes. - if (VecVT.getSizeInBits() < 128) { - EVT ExtVecVT = - MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()), - VecVT.getVectorNumElements()); - SDValue Inp0 = - DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0)); - SDValue Inp1 = - DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0)); - Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0); - Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1); - SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1); - SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH); - Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc); - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc); - } - - // For larger types, split into legal sized chunks. - assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type"); - unsigned NumParts = VecVT.getSizeInBits() / 128; - SmallVector<SDValue> Parts; - for (unsigned I = 0; I < NumParts; ++I) { - SDValue Inp0 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0), - DAG.getVectorIdxConstant(I * LegalLanes, DL)); - SDValue Inp1 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0), - DAG.getVectorIdxConstant(I * LegalLanes, DL)); - SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1); - Parts.push_back(VQDMULH); - } - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, - DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts)); -} - -static SDValue PerformVSELECTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG)) - return V; - - // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs). 
- // - // We need to re-implement this optimization here as the implementation in the - // Target-Independent DAGCombiner does not handle the kind of constant we make - // (it calls isConstOrConstSplat with AllowTruncation set to false - and for - // good reason, allowing truncation there would break other targets). - // - // Currently, this is only done for MVE, as it's the only target that benefits - // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL). - if (N->getOperand(0).getOpcode() != ISD::XOR) - return SDValue(); - SDValue XOR = N->getOperand(0); - - // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s. - // It is important to check with truncation allowed as the BUILD_VECTORs we - // generate in those situations will truncate their operands. - ConstantSDNode *Const = - isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false, - /*AllowTruncation*/ true); - if (!Const || !Const->isOne()) - return SDValue(); - - // Rewrite into vselect(cond, rhs, lhs). - SDValue Cond = XOR->getOperand(0); - SDValue LHS = N->getOperand(1); - SDValue RHS = N->getOperand(2); - EVT Type = N->getValueType(0); - return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS); -} - -// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n -static SDValue PerformVSetCCToVCTPCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); - EVT VT = N->getValueType(0); - - if (!Subtarget->hasMVEIntegerOps() || - !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); - - if (CC == ISD::SETUGE) { - std::swap(Op0, Op1); - CC = ISD::SETULT; - } - - if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 || - Op0.getOpcode() != ISD::BUILD_VECTOR) - return SDValue(); - - // Check first operand is BuildVector of 0,1,2,... 
- for (unsigned I = 0; I < VT.getVectorNumElements(); I++) { - if (!Op0.getOperand(I).isUndef() && - !(isa<ConstantSDNode>(Op0.getOperand(I)) && - Op0.getConstantOperandVal(I) == I)) - return SDValue(); - } - - // The second is a Splat of Op1S - SDValue Op1S = DCI.DAG.getSplatValue(Op1); - if (!Op1S) - return SDValue(); - - unsigned Opc; - switch (VT.getVectorNumElements()) { - case 2: - Opc = Intrinsic::arm_mve_vctp64; - break; - case 4: - Opc = Intrinsic::arm_mve_vctp32; - break; - case 8: - Opc = Intrinsic::arm_mve_vctp16; - break; - case 16: - Opc = Intrinsic::arm_mve_vctp8; - break; - default: - return SDValue(); - } - - SDLoc DL(N); - return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DCI.DAG.getConstant(Opc, DL, MVT::i32), - DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32)); -} - static SDValue PerformABSCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SDValue res; SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) return SDValue(); - return TLI.expandABS(N, DAG); + if (!TLI.expandABS(N, res, DAG)) + return SDValue(); + + return res; } /// PerformADDECombine - Target-specific dag combine transform from @@ -13409,248 +11724,9 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } -static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc dl(N); - - auto IsVecReduce = [](SDValue Op) { - switch (Op.getOpcode()) { - case ISD::VECREDUCE_ADD: - case ARMISD::VADDVs: - case ARMISD::VADDVu: - case ARMISD::VMLAVs: - case ARMISD::VMLAVu: - return true; - } - return false; - }; - - auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) { - // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) -> - // add(add(X, vecreduce(Y)), vecreduce(Z)) - // to make better use of vaddva style instructions. - if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) && - IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) && - !isa<ConstantSDNode>(N0) && N1->hasOneUse()) { - SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0)); - return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1)); - } - // And turn add(add(A, reduce(B)), add(C, reduce(D))) -> - // add(add(add(A, C), reduce(B)), reduce(D)) - if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD && - N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) { - unsigned N0RedOp = 0; - if (!IsVecReduce(N0.getOperand(N0RedOp))) { - N0RedOp = 1; - if (!IsVecReduce(N0.getOperand(N0RedOp))) - return SDValue(); - } - - unsigned N1RedOp = 0; - if (!IsVecReduce(N1.getOperand(N1RedOp))) - N1RedOp = 1; - if (!IsVecReduce(N1.getOperand(N1RedOp))) - return SDValue(); - - SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp), - N1.getOperand(1 - N1RedOp)); - SDValue Add1 = - DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp)); - return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp)); - } - return SDValue(); - }; - if (SDValue R = DistrubuteAddAddVecReduce(N0, N1)) - return R; - if (SDValue R = DistrubuteAddAddVecReduce(N1, N0)) - return R; - - // Distribute add(vecreduce(load(Y)), vecreduce(load(Z))) - // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z))) - // by ascending load offsets. 
This can help cores prefetch if the order of - // loads is more predictable. - auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) { - // Check if two reductions are known to load data where one is before/after - // another. Return negative if N0 loads data before N1, positive if N1 is - // before N0 and 0 otherwise if nothing is known. - auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) { - // Look through to the first operand of a MUL, for the VMLA case. - // Currently only looks at the first operand, in the hope they are equal. - if (N0.getOpcode() == ISD::MUL) - N0 = N0.getOperand(0); - if (N1.getOpcode() == ISD::MUL) - N1 = N1.getOperand(0); - - // Return true if the two operands are loads to the same object and the - // offset of the first is known to be less than the offset of the second. - LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0); - LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1); - if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() || - !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() || - Load1->isIndexed()) - return 0; - - auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG); - auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG); - - if (!BaseLocDecomp0.getBase() || - BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() || - !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset()) - return 0; - if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset()) - return -1; - if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset()) - return 1; - return 0; - }; - - SDValue X; - if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) { - if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) { - int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0), - N0.getOperand(1).getOperand(0)); - if (IsBefore < 0) { - X = N0.getOperand(0); - N0 = N0.getOperand(1); - } else if (IsBefore > 0) { - X = N0.getOperand(1); - N0 = N0.getOperand(0); - } else - return SDValue(); - } else if (IsVecReduce(N0.getOperand(0))) { - X = N0.getOperand(1); - N0 = N0.getOperand(0); - } else if (IsVecReduce(N0.getOperand(1))) { - X = N0.getOperand(0); - N0 = N0.getOperand(1); - } else - return SDValue(); - } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) && - IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) { - // Note this is backward to how you would expect. 
We create - // add(reduce(load + 16), reduce(load + 0)) so that the - // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving - // the X as VADDV(load + 0) - return DAG.getNode(ISD::ADD, dl, VT, N1, N0); - } else - return SDValue(); - - if (!IsVecReduce(N0) || !IsVecReduce(N1)) - return SDValue(); - - if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0) - return SDValue(); - - // Switch from add(add(X, N0), N1) to add(add(X, N1), N0) - SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1); - return DAG.getNode(ISD::ADD, dl, VT, Add0, N0); - }; - if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true)) - return R; - if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false)) - return R; - return SDValue(); -} - -static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - if (SDValue R = TryDistrubutionADDVecReduce(N, DAG)) - return R; - - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc dl(N); - - if (VT != MVT::i64) - return SDValue(); - - // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this - // will look like: - // t1: i32,i32 = ARMISD::VADDLVs x - // t2: i64 = build_pair t1, t1:1 - // t3: i64 = add t2, y - // Otherwise we try to push the add up above VADDLVAx, to potentially allow - // the add to be simplified seperately. - // We also need to check for sext / zext and commutitive adds. - auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA, - SDValue NB) { - if (NB->getOpcode() != ISD::BUILD_PAIR) - return SDValue(); - SDValue VecRed = NB->getOperand(0); - if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) || - VecRed.getResNo() != 0 || - NB->getOperand(1) != SDValue(VecRed.getNode(), 1)) - return SDValue(); - - if (VecRed->getOpcode() == OpcodeA) { - // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y) - SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, - VecRed.getOperand(0), VecRed.getOperand(1)); - NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA); - } - - SmallVector<SDValue, 4> Ops; - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, - DAG.getConstant(0, dl, MVT::i32))); - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, - DAG.getConstant(1, dl, MVT::i32))); - unsigned S = VecRed->getOpcode() == OpcodeA ? 
2 : 0; - for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++) - Ops.push_back(VecRed->getOperand(I)); - SDValue Red = - DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops); - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red, - SDValue(Red.getNode(), 1)); - }; - - if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0)) - return M; - return SDValue(); -} - bool ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const { - assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA || - N->getOpcode() == ISD::SRL) && - "Expected shift op"); - if (Level == BeforeLegalizeTypes) return true; @@ -13684,38 +11760,8 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, return false; } -bool ARMTargetLowering::isDesirableToCommuteXorWithShift( - const SDNode *N) const { - assert(N->getOpcode() == ISD::XOR && - (N->getOperand(0).getOpcode() == ISD::SHL || - N->getOperand(0).getOpcode() == ISD::SRL) && - "Expected XOR(SHIFT) pattern"); - - // Only commute if the entire NOT mask is a hidden shifted mask. 
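// "Hidden shifted mask" above means the XOR constant covers exactly the bits
// the shift can produce, in which case the NOT can be hoisted through the
// shift: (X << C) ^ (~0u << C) == (~X) << C, and likewise
// (X >> C) ^ (~0u >> C) == (~X) >> C for a logical right shift. A sketch of
// the identity only (not of the DAG code), valid for C in [0, 31] on 32-bit
// values; the helper name is illustrative:
#include <cstdint>

static bool notCommutesWithShl(uint32_t X, unsigned C) {
  uint32_t Mask = ~0u << C;                  // shifted mask starting at bit C
  return ((X << C) ^ Mask) == ((~X) << C);   // holds for every X when C < 32
}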
- auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1)); - auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)); - if (XorC && ShiftC) { - unsigned MaskIdx, MaskLen; - if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) { - unsigned ShiftAmt = ShiftC->getZExtValue(); - unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); - if (N->getOperand(0).getOpcode() == ISD::SHL) - return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt); - return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt); - } - } - - return false; -} - bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { - assert(((N->getOpcode() == ISD::SHL && - N->getOperand(0).getOpcode() == ISD::SRL) || - (N->getOpcode() == ISD::SRL && - N->getOperand(0).getOpcode() == ISD::SHL)) && - "Expected shift-shift mask"); - if (!Subtarget->isThumb1Only()) return true; @@ -13734,26 +11780,6 @@ bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { return VT.isScalarInteger(); } -bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, - EVT VT) const { - if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple()) - return false; - - switch (FPVT.getSimpleVT().SimpleTy) { - case MVT::f16: - return Subtarget->hasVFP2Base(); - case MVT::f32: - return Subtarget->hasVFP2Base(); - case MVT::f64: - return Subtarget->hasFP64(); - case MVT::v4f32: - case MVT::v8f16: - return Subtarget->hasMVEFloatOps(); - default: - return false; - } -} - static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { @@ -13781,7 +11807,7 @@ static SDValue PerformSHLSimplify(SDNode *N, return SDValue(); // Check that all the users could perform the shl themselves. - for (auto *U : N->uses()) { + for (auto U : N->uses()) { switch(U->getOpcode()) { default: return SDValue(); @@ -13823,13 +11849,10 @@ static SDValue PerformSHLSimplify(SDNode *N, APInt C2Int = C2->getAPIntValue(); APInt C1Int = C1ShlC2->getAPIntValue(); - unsigned C2Width = C2Int.getBitWidth(); - if (C2Int.uge(C2Width)) - return SDValue(); - uint64_t C2Value = C2Int.getZExtValue(); // Check that performing a lshr will not lose any information. - APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value); + APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), + C2Int.getBitWidth() - C2->getZExtValue()); if ((C1Int & Mask) != C1Int) return SDValue(); @@ -13872,9 +11895,6 @@ static SDValue PerformADDCombine(SDNode *N, if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) return Result; - if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget)) - return Result; - // First try with the default operand order. if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) return Result; @@ -13883,26 +11903,6 @@ static SDValue PerformADDCombine(SDNode *N, return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); } -// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC) -// providing -X is as cheap as X (currently, just a constant). 
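// The fold documented above is a two's-complement identity: csinc picks X or
// Y + 1, and -(Y + 1) == ~Y, so negating the result picks -X or ~Y, i.e. a
// csinv of -X and Y. A self-checking sketch with stand-in helpers (unsigned
// arithmetic so the wraparound is well defined); not the ARMISD node code:
#include <cstdint>

static uint32_t csinc(bool CC, uint32_t X, uint32_t Y) { return CC ? X : Y + 1; }
static uint32_t csinv(bool CC, uint32_t A, uint32_t B) { return CC ? A : ~B; }

static bool foldHolds(bool CC, uint32_t X, uint32_t Y) {
  return 0u - csinc(CC, X, Y) == csinv(CC, 0u - X, Y);  // true for all inputs
}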
-static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) { - if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0))) - return SDValue(); - SDValue CSINC = N->getOperand(1); - if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse()) - return SDValue(); - - ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0)); - if (!X) - return SDValue(); - - return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32, - DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0), - CSINC.getOperand(0)), - CSINC.getOperand(1), CSINC.getOperand(2), - CSINC.getOperand(3)); -} - /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. /// static SDValue PerformSUBCombine(SDNode *N, @@ -13916,9 +11916,6 @@ static SDValue PerformSUBCombine(SDNode *N, if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) return Result; - if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG)) - return R; - if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) return SDValue(); @@ -13989,86 +11986,18 @@ static SDValue PerformVMULCombine(SDNode *N, DAG.getNode(ISD::MUL, DL, VT, N01, N1)); } -static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - EVT VT = N->getValueType(0); - if (VT != MVT::v2i64) - return SDValue(); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - auto IsSignExt = [&](SDValue Op) { - if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG) - return SDValue(); - EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT(); - if (VT.getScalarSizeInBits() == 32) - return Op->getOperand(0); - return SDValue(); - }; - auto IsZeroExt = [&](SDValue Op) { - // Zero extends are a little more awkward. At the point we are matching - // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask. - // That might be before of after a bitcast depending on how the and is - // placed. Because this has to look through bitcasts, it is currently only - // supported on LE. 
- if (!Subtarget->isLittle()) - return SDValue(); - - SDValue And = Op; - if (And->getOpcode() == ISD::BITCAST) - And = And->getOperand(0); - if (And->getOpcode() != ISD::AND) - return SDValue(); - SDValue Mask = And->getOperand(1); - if (Mask->getOpcode() == ISD::BITCAST) - Mask = Mask->getOperand(0); - - if (Mask->getOpcode() != ISD::BUILD_VECTOR || - Mask.getValueType() != MVT::v4i32) - return SDValue(); - if (isAllOnesConstant(Mask->getOperand(0)) && - isNullConstant(Mask->getOperand(1)) && - isAllOnesConstant(Mask->getOperand(2)) && - isNullConstant(Mask->getOperand(3))) - return And->getOperand(0); - return SDValue(); - }; - - SDLoc dl(N); - if (SDValue Op0 = IsSignExt(N0)) { - if (SDValue Op1 = IsSignExt(N1)) { - SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); - SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); - return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a); - } - } - if (SDValue Op0 = IsZeroExt(N0)) { - if (SDValue Op1 = IsZeroExt(N1)) { - SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); - SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); - return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a); - } - } - - return SDValue(); -} - static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64) - return PerformMVEVMULLCombine(N, DAG, Subtarget); - if (Subtarget->isThumb1Only()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); + EVT VT = N->getValueType(0); if (VT.is64BitVector() || VT.is128BitVector()) return PerformVMULCombine(N, DCI, Subtarget); if (VT != MVT::i32) @@ -14253,21 +12182,20 @@ static SDValue PerformANDCombine(SDNode *N, EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 || - VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1) + if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && + if (BVN && Subtarget->hasNEON() && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || - SplatBitSize == 64) { + if (SplatBitSize <= 64) { EVT VbicVT; SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VbicVT, VT, OtherModImm); + DAG, dl, VbicVT, VT.is128BitVector(), + OtherModImm); if (Val.getNode()) { SDValue Input = DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); @@ -14497,43 +12425,58 @@ static bool isValidMVECond(unsigned CC, bool IsFloat) { }; } -static ARMCC::CondCodes getVCMPCondCode(SDValue N) { - if (N->getOpcode() == ARMISD::VCMP) - return (ARMCC::CondCodes)N->getConstantOperandVal(2); - else if (N->getOpcode() == ARMISD::VCMPZ) - return (ARMCC::CondCodes)N->getConstantOperandVal(1); - else - llvm_unreachable("Not a VCMP/VCMPZ!"); -} - -static bool CanInvertMVEVCMP(SDValue N) { - ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N)); - return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint()); -} - -static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, +static SDValue PerformORCombine_i1(SDNode *N, + 
TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain // together with predicates EVT VT = N->getValueType(0); - SDLoc DL(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - auto IsFreelyInvertable = [&](SDValue V) { - if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ) - return CanInvertMVEVCMP(V); - return false; - }; - - // At least one operand must be freely invertable. - if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1))) - return SDValue(); - - SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT); - SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT); - SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1); - return DAG.getLogicalNOT(DL, And, VT); + ARMCC::CondCodes CondCode0 = ARMCC::AL; + ARMCC::CondCodes CondCode1 = ARMCC::AL; + if (N0->getOpcode() == ARMISD::VCMP) + CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2)) + ->getZExtValue(); + else if (N0->getOpcode() == ARMISD::VCMPZ) + CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1)) + ->getZExtValue(); + if (N1->getOpcode() == ARMISD::VCMP) + CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2)) + ->getZExtValue(); + else if (N1->getOpcode() == ARMISD::VCMPZ) + CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1)) + ->getZExtValue(); + + if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) + return SDValue(); + + unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); + unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); + + if (!isValidMVECond(Opposite0, + N0->getOperand(0)->getValueType(0).isFloatingPoint()) || + !isValidMVECond(Opposite1, + N1->getOperand(0)->getValueType(0).isFloatingPoint())) + return SDValue(); + + SmallVector<SDValue, 4> Ops0; + Ops0.push_back(N0->getOperand(0)); + if (N0->getOpcode() == ARMISD::VCMP) + Ops0.push_back(N0->getOperand(1)); + Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); + SmallVector<SDValue, 4> Ops1; + Ops1.push_back(N1->getOperand(0)); + if (N1->getOpcode() == ARMISD::VCMP) + Ops1.push_back(N1->getOperand(1)); + Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); + + SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0); + SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1); + SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); + return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And, + DCI.DAG.getAllOnesConstant(SDLoc(N), VT)); } /// PerformORCombine - Target-specific dag combine xforms for ISD::OR @@ -14549,21 +12492,17 @@ static SDValue PerformORCombine(SDNode *N, if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 || - VT == MVT::v8i1 || VT == MVT::v16i1)) - return PerformORCombine_i1(N, DAG, Subtarget); - APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && + if (BVN && Subtarget->hasNEON() && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || - SplatBitSize == 64) { + if (SplatBitSize <= 64) { EVT VorrVT; - SDValue Val = - isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), - SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm); + SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), 
+ SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VorrVT, VT.is128BitVector(), + OtherModImm); if (Val.getNode()) { SDValue Input = DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); @@ -14614,7 +12553,7 @@ static SDValue PerformORCombine(SDNode *N, // Canonicalize the vector type to make instruction selection // simpler. EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; - SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT, + SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, N0->getOperand(1), N0->getOperand(0), N1->getOperand(0)); @@ -14624,6 +12563,10 @@ static SDValue PerformORCombine(SDNode *N, } } + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) + return PerformORCombine_i1(N, DCI, Subtarget); + // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when // reasonable. if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { @@ -14655,27 +12598,6 @@ static SDValue PerformXORCombine(SDNode *N, return Result; } - if (Subtarget->hasMVEIntegerOps()) { - // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition. - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - const TargetLowering *TLI = Subtarget->getTargetLowering(); - if (TLI->isConstTrueVal(N1) && - (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) { - if (CanInvertMVEVCMP(N0)) { - SDLoc DL(N0); - ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0)); - - SmallVector<SDValue, 4> Ops; - Ops.push_back(N0->getOperand(0)); - if (N0->getOpcode() == ARMISD::VCMP) - Ops.push_back(N0->getOperand(1)); - Ops.push_back(DAG.getConstant(CC, DL, MVT::i32)); - return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops); - } - } - } - return SDValue(); } @@ -14712,40 +12634,52 @@ static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { } static SDValue FindBFIToCombineWith(SDNode *N) { - // We have a BFI in N. Find a BFI it can combine with, if one exists. + // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, + // if one exists. APInt ToMask, FromMask; SDValue From = ParseBFI(N, ToMask, FromMask); SDValue To = N->getOperand(0); + // Now check for a compatible BFI to merge with. We can pass through BFIs that + // aren't compatible, but not if they set the same bit in their destination as + // we do (or that of any BFI we're going to combine with). SDValue V = To; - if (V.getOpcode() != ARMISD::BFI) - return SDValue(); + APInt CombinedToMask = ToMask; + while (V.getOpcode() == ARMISD::BFI) { + APInt NewToMask, NewFromMask; + SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); + if (NewFrom != From) { + // This BFI has a different base. Keep going. + CombinedToMask |= NewToMask; + V = V.getOperand(0); + continue; + } - APInt NewToMask, NewFromMask; - SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); - if (NewFrom != From) - return SDValue(); + // Do the written bits conflict with any we've seen so far? + if ((NewToMask & CombinedToMask).getBoolValue()) + // Conflicting bits - bail out because going further is unsafe. + return SDValue(); - // Do the written bits conflict with any we've seen so far? - if ((NewToMask & ToMask).getBoolValue()) - // Conflicting bits. - return SDValue(); + // Are the new bits contiguous when combined with the old bits? 
+ if (BitsProperlyConcatenate(ToMask, NewToMask) && + BitsProperlyConcatenate(FromMask, NewFromMask)) + return V; + if (BitsProperlyConcatenate(NewToMask, ToMask) && + BitsProperlyConcatenate(NewFromMask, FromMask)) + return V; - // Are the new bits contiguous when combined with the old bits? - if (BitsProperlyConcatenate(ToMask, NewToMask) && - BitsProperlyConcatenate(FromMask, NewFromMask)) - return V; - if (BitsProperlyConcatenate(NewToMask, ToMask) && - BitsProperlyConcatenate(NewFromMask, FromMask)) - return V; + // We've seen a write to some bits, so track it. + CombinedToMask |= NewToMask; + // Keep going... + V = V.getOperand(0); + } return SDValue(); } -static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); +static SDValue PerformBFICombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::AND) { // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff // the bits being cleared by the AND are not demanded by the BFI. @@ -14754,20 +12688,24 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); unsigned LSB = countTrailingZeros(~InvMask); - unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB; + unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; assert(Width < static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && "undefined behavior"); unsigned Mask = (1u << Width) - 1; unsigned Mask2 = N11C->getZExtValue(); if ((Mask & (~Mask2)) == 0) - return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), - N->getOperand(0), N1.getOperand(0), N->getOperand(2)); - return SDValue(); - } + return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), + N->getOperand(0), N1.getOperand(0), + N->getOperand(2)); + } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { + // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. + // Keep track of any consecutive bits set that all come from the same base + // value. We can combine these together into a single BFI. + SDValue CombineBFI = FindBFIToCombineWith(N); + if (CombineBFI == SDValue()) + return SDValue(); - // Look for another BFI to combine with. - if (SDValue CombineBFI = FindBFIToCombineWith(N)) { // We've found a BFI. APInt ToMask1, FromMask1; SDValue From1 = ParseBFI(N, ToMask1, FromMask1); @@ -14777,7 +12715,9 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { assert(From1 == From2); (void)From2; - // Create a new BFI, combining the two together. + // First, unlink CombineBFI. + DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); + // Then create a new BFI, combining the two together. APInt NewFromMask = FromMask1 | FromMask2; APInt NewToMask = ToMask1 | ToMask2; @@ -14785,101 +12725,11 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); if (NewFromMask[0] == 0) - From1 = DAG.getNode( - ISD::SRL, dl, VT, From1, - DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); - return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1, - DAG.getConstant(~NewToMask, dl, VT)); - } - - // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so - // that lower bit insertions are performed first, providing that M1 and M2 - // do no overlap. This can allow multiple BFI instructions to be combined - // together by the other folds above. 
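// Editor's illustration (not part of the patch): the BFI-chain walk above only
// merges two bitfield inserts when their destination masks do not overlap and
// sit immediately next to each other. A minimal sketch of that contiguity test
// using plain 32-bit masks instead of APInt; the function name is hypothetical
// and the bit scans use GCC/Clang builtins.
#include <cstdint>

static bool masksConcatenate(uint32_t A, uint32_t B) {
  if (A == 0 || B == 0 || (A & B) != 0)
    return false;                              // empty or overlapping writes
  unsigned LowestInA = __builtin_ctz(A);       // lowest bit written by A
  unsigned HighestInB = 31 - __builtin_clz(B); // highest bit written by B
  return LowestInA == HighestInB + 1;          // A starts right above B
}
// e.g. masksConcatenate(0x00f0, 0x000f) is true; masksConcatenate(0x0f00, 0x000f) is false.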
-  if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
-    APInt ToMask1 = ~N->getConstantOperandAPInt(2);
-    APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
-
-    if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
-        ToMask1.countLeadingZeros() < ToMask2.countLeadingZeros())
-      return SDValue();
-
-    EVT VT = N->getValueType(0);
-    SDLoc dl(N);
-    SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
-                               N->getOperand(1), N->getOperand(2));
-    return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
-                       N0.getOperand(2));
-  }
-
-  return SDValue();
-}
-
-// Check that N is CMPZ(CSINC(0, 0, CC, X)),
-// or CMPZ(CMOV(1, 0, CC, $cpsr, X))
-// return X if valid.
-static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
-  if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
-    return SDValue();
-  SDValue CSInc = Cmp->getOperand(0);
-
-  // Ignore any `And 1` nodes that may not yet have been removed. We are
-  // looking for a value that produces 1/0, so these have no effect on the
-  // code.
-  while (CSInc.getOpcode() == ISD::AND &&
-         isa<ConstantSDNode>(CSInc.getOperand(1)) &&
-         CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
-    CSInc = CSInc.getOperand(0);
-
-  if (CSInc.getOpcode() == ARMISD::CSINC &&
-      isNullConstant(CSInc.getOperand(0)) &&
-      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
-    CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
-    return CSInc.getOperand(3);
-  }
-  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
-      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
-    CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
-    return CSInc.getOperand(4);
-  }
-  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
-      isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
-    CC = ARMCC::getOppositeCondition(
-        (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
-    return CSInc.getOperand(4);
-  }
-  return SDValue();
-}
-
-static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
-  // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly.
As in - // t92: glue = ARMISD::CMPZ t74, 0 - // t93: i32 = ARMISD::CSINC 0, 0, 1, t92 - // t96: glue = ARMISD::CMPZ t93, 0 - // t114: i32 = ARMISD::CSINV 0, 0, 0, t96 - ARMCC::CondCodes Cond; - if (SDValue C = IsCMPZCSINC(N, Cond)) - if (Cond == ARMCC::EQ) - return C; - return SDValue(); -} - -static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) { - // Fold away an unneccessary CMPZ/CSINC - // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) -> - // if C1==EQ -> CSXYZ A, B, C2, D - // if C1==NE -> CSXYZ A, B, NOT(C2), D - ARMCC::CondCodes Cond; - if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) { - if (N->getConstantOperandVal(2) == ARMCC::EQ) - return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), - N->getOperand(1), - DAG.getConstant(Cond, SDLoc(N), MVT::i32), C); - if (N->getConstantOperandVal(2) == ARMCC::NE) - return DAG.getNode( - N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), - N->getOperand(1), - DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C); + From1 = DCI.DAG.getNode( + ISD::SRL, dl, VT, From1, + DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); + return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, + DCI.DAG.getConstant(~NewToMask, dl, VT)); } return SDValue(); } @@ -14908,14 +12758,14 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, SDValue BasePtr = LD->getBasePtr(); SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), - LD->getAlign(), LD->getMemOperand()->getFlags()); + LD->getAlignment(), LD->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, LD->getPointerInfo().getWithOffset(4), - commonAlignment(LD->getAlign(), 4), + std::min(4U, LD->getAlignment()), LD->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); @@ -14925,54 +12775,6 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, return Result; } - // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d - // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b - if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(InDouble.getOperand(1))) { - SDValue BV = InDouble.getOperand(0); - // Look up through any nop bitcasts and vector_reg_casts. bitcasts may - // change lane order under big endian. - bool BVSwap = BV.getOpcode() == ISD::BITCAST; - while ( - (BV.getOpcode() == ISD::BITCAST || - BV.getOpcode() == ARMISD::VECTOR_REG_CAST) && - (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) { - BVSwap = BV.getOpcode() == ISD::BITCAST; - BV = BV.getOperand(0); - } - if (BV.getValueType() != MVT::v4i32) - return SDValue(); - - // Handle buildvectors, pulling out the correct lane depending on - // endianness. - unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0; - if (BV.getOpcode() == ISD::BUILD_VECTOR) { - SDValue Op0 = BV.getOperand(Offset); - SDValue Op1 = BV.getOperand(Offset + 1); - if (!Subtarget->isLittle() && BVSwap) - std::swap(Op0, Op1); - - return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); - } - - // A chain of insert_vectors, grabbing the correct value of the chain of - // inserts. 
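// Editor's illustration (not part of the patch): the identity behind the
// CMPZ(CSINC)/CSET folds above. Materialising a condition into a 0/1 value and
// then comparing that value against zero only re-tests the condition (or its
// inverse), so the intermediate 0/1 value can be dropped. A self-contained
// sketch at the boolean level; names are hypothetical.
#include <cassert>

static int materialize(bool Cond) { return Cond ? 1 : 0; }

int main() {
  const bool Vals[] = {false, true};
  for (bool C : Vals) {
    assert((materialize(C) != 0) == C);  // NE against zero == original condition
    assert((materialize(C) == 0) == !C); // EQ against zero == opposite condition
  }
  return 0;
}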
- SDValue Op0, Op1; - while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) { - if (isa<ConstantSDNode>(BV.getOperand(2))) { - if (BV.getConstantOperandVal(2) == Offset) - Op0 = BV.getOperand(1); - if (BV.getConstantOperandVal(2) == Offset + 1) - Op1 = BV.getOperand(1); - } - BV = BV.getOperand(0); - } - if (!Subtarget->isLittle() && BVSwap) - std::swap(Op0, Op1); - if (Op0 && Op1) - return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); - } - return SDValue(); } @@ -14994,84 +12796,6 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue PerformVMOVhrCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SDValue Op0 = N->getOperand(0); - - // VMOVhr (VMOVrh (X)) -> X - if (Op0->getOpcode() == ARMISD::VMOVrh) - return Op0->getOperand(0); - - // FullFP16: half values are passed in S-registers, and we don't - // need any of the bitcast and moves: - // - // t2: f32,ch = CopyFromReg t0, Register:f32 %0 - // t5: i32 = bitcast t2 - // t18: f16 = ARMISD::VMOVhr t5 - if (Op0->getOpcode() == ISD::BITCAST) { - SDValue Copy = Op0->getOperand(0); - if (Copy.getValueType() == MVT::f32 && - Copy->getOpcode() == ISD::CopyFromReg) { - SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)}; - SDValue NewCopy = - DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops); - return NewCopy; - } - } - - // fold (VMOVhr (load x)) -> (load (f16*)x) - if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) { - if (LN0->hasOneUse() && LN0->isUnindexed() && - LN0->getMemoryVT() == MVT::i16) { - SDValue Load = - DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(), - LN0->getBasePtr(), LN0->getMemOperand()); - DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); - DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1)); - return Load; - } - } - - // Only the bottom 16 bits of the source register are used. - APInt DemandedMask = APInt::getLowBitsSet(32, 16); - const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI)) - return SDValue(N, 0); - - return SDValue(); -} - -static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - EVT VT = N->getValueType(0); - - // fold (VMOVrh (fpconst x)) -> const x - if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) { - APFloat V = C->getValueAPF(); - return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT); - } - - // fold (VMOVrh (load x)) -> (zextload (i16*)x) - if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) { - LoadSDNode *LN0 = cast<LoadSDNode>(N0); - - SDValue Load = - DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), - LN0->getBasePtr(), MVT::i16, LN0->getMemOperand()); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); - return Load; - } - - // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n) - if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(N0->getOperand(1))) - return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0), - N0->getOperand(1)); - - return SDValue(); -} - /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node /// are normal, non-volatile loads. 
If so, it is profitable to bitcast an /// i64 vector to have f64 elements, since the value can then be loaded @@ -15222,55 +12946,15 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // If the valuetypes are the same, we can remove the cast entirely. if (Op->getOperand(0).getValueType() == VT) return Op->getOperand(0); - return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); + return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, + Op->getOperand(0).getValueType(), Op->getOperand(0)); } - // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce - // more VPNOT which might get folded as else predicates. - if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) { - SDValue X = - DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); - SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, - DCI.DAG.getConstant(65535, dl, MVT::i32)); - return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C); - } - - // Only the bottom 16 bits of the source register are used. - if (Op.getValueType() == MVT::i32) { - APInt DemandedMask = APInt::getLowBitsSet(32, 16); - const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) - return SDValue(N, 0); - } return SDValue(); } -static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); - SDValue Op = N->getOperand(0); - SDLoc dl(N); - - // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST - if (ST->isLittle()) - return DAG.getNode(ISD::BITCAST, dl, VT, Op); - - // VECTOR_REG_CAST undef -> undef - if (Op.isUndef()) - return DAG.getUNDEF(VT); - - // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x) - if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) { - // If the valuetypes are the same, we can remove the cast entirely. - if (Op->getOperand(0).getValueType() == VT) - return Op->getOperand(0); - return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0)); - } - - return SDValue(); -} - -static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, +static SDValue PerformVCMPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { if (!Subtarget->hasMVEIntegerOps()) return SDValue(); @@ -15284,18 +12968,19 @@ static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, // vcmp X, 0, cc -> vcmpz X, cc if (isZeroVector(Op1)) - return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2)); + return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, + N->getOperand(2)); unsigned SwappedCond = getSwappedCondition(Cond); if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) { // vcmp 0, X, cc -> vcmpz X, reversed(cc) if (isZeroVector(Op0)) - return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, - DAG.getConstant(SwappedCond, dl, MVT::i32)); + return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, + DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc) if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP) - return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, - DAG.getConstant(SwappedCond, dl, MVT::i32)); + return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, + DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); } return SDValue(); @@ -15327,265 +13012,9 @@ static SDValue PerformInsertEltCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); } -// Convert a pair of extracts from the same base vector to a VMOVRRD. 
Either -// directly or bitcast to an integer if the original is a float vector. -// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2) -// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2) -static SDValue -PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - EVT VT = N->getValueType(0); - SDLoc dl(N); - - if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 || - !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64)) - return SDValue(); - - SDValue Ext = SDValue(N, 0); - if (Ext.getOpcode() == ISD::BITCAST && - Ext.getOperand(0).getValueType() == MVT::f32) - Ext = Ext.getOperand(0); - if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - !isa<ConstantSDNode>(Ext.getOperand(1)) || - Ext.getConstantOperandVal(1) % 2 != 0) - return SDValue(); - if (Ext->use_size() == 1 && - (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP || - Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP)) - return SDValue(); - - SDValue Op0 = Ext.getOperand(0); - EVT VecVT = Op0.getValueType(); - unsigned ResNo = Op0.getResNo(); - unsigned Lane = Ext.getConstantOperandVal(1); - if (VecVT.getVectorNumElements() != 4) - return SDValue(); - - // Find another extract, of Lane + 1 - auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) { - return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(V->getOperand(1)) && - V->getConstantOperandVal(1) == Lane + 1 && - V->getOperand(0).getResNo() == ResNo; - }); - if (OtherIt == Op0->uses().end()) - return SDValue(); - - // For float extracts, we need to be converting to a i32 for both vector - // lanes. - SDValue OtherExt(*OtherIt, 0); - if (OtherExt.getValueType() != MVT::i32) { - if (OtherExt->use_size() != 1 || - OtherExt->use_begin()->getOpcode() != ISD::BITCAST || - OtherExt->use_begin()->getValueType(0) != MVT::i32) - return SDValue(); - OtherExt = SDValue(*OtherExt->use_begin(), 0); - } - - // Convert the type to a f64 and extract with a VMOVRRD. 
- SDValue F64 = DCI.DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0), - DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32)); - SDValue VMOVRRD = - DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64); - - DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1)); - return VMOVRRD; -} - -static SDValue PerformExtractEltCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *ST) { - SDValue Op0 = N->getOperand(0); - EVT VT = N->getValueType(0); - SDLoc dl(N); - - // extract (vdup x) -> x - if (Op0->getOpcode() == ARMISD::VDUP) { - SDValue X = Op0->getOperand(0); - if (VT == MVT::f16 && X.getValueType() == MVT::i32) - return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X); - if (VT == MVT::i32 && X.getValueType() == MVT::f16) - return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X); - if (VT == MVT::f32 && X.getValueType() == MVT::i32) - return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X); - - while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST) - X = X->getOperand(0); - if (X.getValueType() == VT) - return X; - } - - // extract ARM_BUILD_VECTOR -> x - if (Op0->getOpcode() == ARMISD::BUILD_VECTOR && - isa<ConstantSDNode>(N->getOperand(1)) && - N->getConstantOperandVal(1) < Op0.getNumOperands()) { - return Op0.getOperand(N->getConstantOperandVal(1)); - } - - // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b - if (Op0.getValueType() == MVT::v4i32 && - isa<ConstantSDNode>(N->getOperand(1)) && - Op0.getOpcode() == ISD::BITCAST && - Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && - Op0.getOperand(0).getValueType() == MVT::v2f64) { - SDValue BV = Op0.getOperand(0); - unsigned Offset = N->getConstantOperandVal(1); - SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1); - if (MOV.getOpcode() == ARMISD::VMOVDRR) - return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2); - } - - // extract x, n; extract x, n+1 -> VMOVRRD x - if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) - return R; - - // extract (MVETrunc(x)) -> extract x - if (Op0->getOpcode() == ARMISD::MVETRUNC) { - unsigned Idx = N->getConstantOperandVal(1); - unsigned Vec = - Idx / Op0->getOperand(0).getValueType().getVectorNumElements(); - unsigned SubIdx = - Idx % Op0->getOperand(0).getValueType().getVectorNumElements(); - return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec), - DCI.DAG.getConstant(SubIdx, dl, MVT::i32)); - } - - return SDValue(); -} - -static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) { - SDValue Op = N->getOperand(0); - EVT VT = N->getValueType(0); - - // sext_inreg(VGETLANEu) -> VGETLANEs - if (Op.getOpcode() == ARMISD::VGETLANEu && - cast<VTSDNode>(N->getOperand(1))->getVT() == - Op.getOperand(0).getValueType().getScalarType()) - return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0), - Op.getOperand(1)); - - return SDValue(); -} - -// When lowering complex nodes that we recognize, like VQDMULH and MULH, we -// can end up with shuffle(binop(shuffle, shuffle)), that can be simplified to -// binop as the shuffles cancel out. -static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT) - return SDValue(); - SDValue Op = N->getOperand(0); - - // Looking for binary operators that will have been folded from - // truncates/extends. 
- switch (Op.getOpcode()) { - case ARMISD::VQDMULH: - case ISD::MULHS: - case ISD::MULHU: - case ISD::ABDS: - case ISD::ABDU: - case ISD::AVGFLOORS: - case ISD::AVGFLOORU: - case ISD::AVGCEILS: - case ISD::AVGCEILU: - break; - default: - return SDValue(); - } - - ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0)); - ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1)); - if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() || - !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() || - Op0->getOperand(0).getValueType() != VT) - return SDValue(); - - // Check the mask turns into an identity shuffle. - ArrayRef<int> NMask = N->getMask(); - ArrayRef<int> OpMask = Op0->getMask(); - for (int i = 0, e = NMask.size(); i != e; i++) { - if (NMask[i] > 0 && OpMask[NMask[i]] > 0 && OpMask[NMask[i]] != i) - return SDValue(); - } - - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - Op0->getOperand(0), Op1->getOperand(0)); -} - -static SDValue -PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - SDValue Vec = N->getOperand(0); - SDValue SubVec = N->getOperand(1); - uint64_t IdxVal = N->getConstantOperandVal(2); - EVT VecVT = Vec.getValueType(); - EVT SubVT = SubVec.getValueType(); - - // Only do this for legal fixed vector types. - if (!VecVT.isFixedLengthVector() || - !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) || - !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT)) - return SDValue(); - - // Ignore widening patterns. - if (IdxVal == 0 && Vec.isUndef()) - return SDValue(); - - // Subvector must be half the width and an "aligned" insertion. - unsigned NumSubElts = SubVT.getVectorNumElements(); - if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() || - (IdxVal != 0 && IdxVal != NumSubElts)) - return SDValue(); - - // Fold insert_subvector -> concat_vectors - // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi)) - // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub) - SDLoc DL(N); - SDValue Lo, Hi; - if (IdxVal == 0) { - Lo = SubVec; - Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, - DCI.DAG.getVectorIdxConstant(NumSubElts, DL)); - } else { - Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, - DCI.DAG.getVectorIdxConstant(0, DL)); - Hi = SubVec; - } - return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi); -} - -// shuffle(MVETrunc(x, y)) -> VMOVN(x, y) -static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, - SelectionDAG &DAG) { - SDValue Trunc = N->getOperand(0); - EVT VT = Trunc.getValueType(); - if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef()) - return SDValue(); - - SDLoc DL(Trunc); - if (isVMOVNTruncMask(N->getMask(), VT, false)) - return DAG.getNode( - ARMISD::VMOVN, DL, VT, - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)), - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), - DAG.getConstant(1, DL, MVT::i32)); - else if (isVMOVNTruncMask(N->getMask(), VT, true)) - return DAG.getNode( - ARMISD::VMOVN, DL, VT, - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)), - DAG.getConstant(1, DL, MVT::i32)); - return SDValue(); -} - /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for /// ISD::VECTOR_SHUFFLE. 
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { - if (SDValue R = FlattenVectorShuffle(cast<ShuffleVectorSDNode>(N), DAG)) - return R; - if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG)) - return R; - // The LLVM shufflevector instruction does not require the shuffle mask // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the @@ -15635,388 +13064,6 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { DAG.getUNDEF(VT), NewMask); } -/// Load/store instruction that can be merged with a base address -/// update -struct BaseUpdateTarget { - SDNode *N; - bool isIntrinsic; - bool isStore; - unsigned AddrOpIdx; -}; - -struct BaseUpdateUser { - /// Instruction that updates a pointer - SDNode *N; - /// Pointer increment operand - SDValue Inc; - /// Pointer increment value if it is a constant, or 0 otherwise - unsigned ConstInc; -}; - -static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, - struct BaseUpdateUser &User, - bool SimpleConstIncOnly, - TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - SDNode *N = Target.N; - MemSDNode *MemN = cast<MemSDNode>(N); - SDLoc dl(N); - - // Find the new opcode for the updating load/store. - bool isLoadOp = true; - bool isLaneOp = false; - // Workaround for vst1x and vld1x intrinsics which do not have alignment - // as an operand. - bool hasAlignment = true; - unsigned NewOpc = 0; - unsigned NumVecs = 0; - if (Target.isIntrinsic) { - unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: - llvm_unreachable("unexpected intrinsic for Neon base update"); - case Intrinsic::arm_neon_vld1: - NewOpc = ARMISD::VLD1_UPD; - NumVecs = 1; - break; - case Intrinsic::arm_neon_vld2: - NewOpc = ARMISD::VLD2_UPD; - NumVecs = 2; - break; - case Intrinsic::arm_neon_vld3: - NewOpc = ARMISD::VLD3_UPD; - NumVecs = 3; - break; - case Intrinsic::arm_neon_vld4: - NewOpc = ARMISD::VLD4_UPD; - NumVecs = 4; - break; - case Intrinsic::arm_neon_vld1x2: - NewOpc = ARMISD::VLD1x2_UPD; - NumVecs = 2; - hasAlignment = false; - break; - case Intrinsic::arm_neon_vld1x3: - NewOpc = ARMISD::VLD1x3_UPD; - NumVecs = 3; - hasAlignment = false; - break; - case Intrinsic::arm_neon_vld1x4: - NewOpc = ARMISD::VLD1x4_UPD; - NumVecs = 4; - hasAlignment = false; - break; - case Intrinsic::arm_neon_vld2dup: - NewOpc = ARMISD::VLD2DUP_UPD; - NumVecs = 2; - break; - case Intrinsic::arm_neon_vld3dup: - NewOpc = ARMISD::VLD3DUP_UPD; - NumVecs = 3; - break; - case Intrinsic::arm_neon_vld4dup: - NewOpc = ARMISD::VLD4DUP_UPD; - NumVecs = 4; - break; - case Intrinsic::arm_neon_vld2lane: - NewOpc = ARMISD::VLD2LN_UPD; - NumVecs = 2; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vld3lane: - NewOpc = ARMISD::VLD3LN_UPD; - NumVecs = 3; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vld4lane: - NewOpc = ARMISD::VLD4LN_UPD; - NumVecs = 4; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vst1: - NewOpc = ARMISD::VST1_UPD; - NumVecs = 1; - isLoadOp = false; - break; - case Intrinsic::arm_neon_vst2: - NewOpc = ARMISD::VST2_UPD; - NumVecs = 2; - isLoadOp = false; - break; - case Intrinsic::arm_neon_vst3: - NewOpc = ARMISD::VST3_UPD; - NumVecs = 3; - isLoadOp = false; - break; - case Intrinsic::arm_neon_vst4: - NewOpc = ARMISD::VST4_UPD; - NumVecs = 4; - isLoadOp = false; - break; - case Intrinsic::arm_neon_vst2lane: - NewOpc = ARMISD::VST2LN_UPD; - 
NumVecs = 2; - isLoadOp = false; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vst3lane: - NewOpc = ARMISD::VST3LN_UPD; - NumVecs = 3; - isLoadOp = false; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vst4lane: - NewOpc = ARMISD::VST4LN_UPD; - NumVecs = 4; - isLoadOp = false; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vst1x2: - NewOpc = ARMISD::VST1x2_UPD; - NumVecs = 2; - isLoadOp = false; - hasAlignment = false; - break; - case Intrinsic::arm_neon_vst1x3: - NewOpc = ARMISD::VST1x3_UPD; - NumVecs = 3; - isLoadOp = false; - hasAlignment = false; - break; - case Intrinsic::arm_neon_vst1x4: - NewOpc = ARMISD::VST1x4_UPD; - NumVecs = 4; - isLoadOp = false; - hasAlignment = false; - break; - } - } else { - isLaneOp = true; - switch (N->getOpcode()) { - default: - llvm_unreachable("unexpected opcode for Neon base update"); - case ARMISD::VLD1DUP: - NewOpc = ARMISD::VLD1DUP_UPD; - NumVecs = 1; - break; - case ARMISD::VLD2DUP: - NewOpc = ARMISD::VLD2DUP_UPD; - NumVecs = 2; - break; - case ARMISD::VLD3DUP: - NewOpc = ARMISD::VLD3DUP_UPD; - NumVecs = 3; - break; - case ARMISD::VLD4DUP: - NewOpc = ARMISD::VLD4DUP_UPD; - NumVecs = 4; - break; - case ISD::LOAD: - NewOpc = ARMISD::VLD1_UPD; - NumVecs = 1; - isLaneOp = false; - break; - case ISD::STORE: - NewOpc = ARMISD::VST1_UPD; - NumVecs = 1; - isLaneOp = false; - isLoadOp = false; - break; - } - } - - // Find the size of memory referenced by the load/store. - EVT VecTy; - if (isLoadOp) { - VecTy = N->getValueType(0); - } else if (Target.isIntrinsic) { - VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType(); - } else { - assert(Target.isStore && - "Node has to be a load, a store, or an intrinsic!"); - VecTy = N->getOperand(1).getValueType(); - } - - bool isVLDDUPOp = - NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD || - NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD; - - unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; - if (isLaneOp || isVLDDUPOp) - NumBytes /= VecTy.getVectorNumElements(); - - if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) { - // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two - // separate instructions that make it harder to use a non-constant update. - return false; - } - - if (SimpleConstIncOnly && User.ConstInc != NumBytes) - return false; - - // OK, we found an ADD we can fold into the base update. - // Now, create a _UPD node, taking care of not breaking alignment. - - EVT AlignedVecTy = VecTy; - Align Alignment = MemN->getAlign(); - - // If this is a less-than-standard-aligned load/store, change the type to - // match the standard alignment. - // The alignment is overlooked when selecting _UPD variants; and it's - // easier to introduce bitcasts here than fix that. - // There are 3 ways to get to this base-update combine: - // - intrinsics: they are assumed to be properly aligned (to the standard - // alignment of the memory type), so we don't need to do anything. - // - ARMISD::VLDx nodes: they are only generated from the aforementioned - // intrinsics, so, likewise, there's nothing to do. - // - generic load/store instructions: the alignment is specified as an - // explicit operand, rather than implicitly as the standard alignment - // of the memory type (like the intrisics). We need to change the - // memory type to match the explicit alignment. That way, we don't - // generate non-standard-aligned ARMISD::VLDx nodes. 
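// Editor's illustration (not part of the patch): how the base-update combine
// re-types an under-aligned generic load/store so the element width matches
// the known alignment before forming a VLD1/VST1_UPD node. A sketch with
// hypothetical names; e.g. a byte-aligned v4i16 access is treated as v8i8.
#include <cassert>

struct SimpleVT { unsigned EltBits, NumElts; };

static SimpleVT retypeForAlignment(SimpleVT VT, unsigned AlignBytes) {
  unsigned EltBytes = VT.EltBits / 8;
  if (AlignBytes >= EltBytes)
    return VT;                                   // already standard-aligned
  unsigned TotalBytes = EltBytes * VT.NumElts;
  return {AlignBytes * 8, TotalBytes / AlignBytes};
}

int main() {
  SimpleVT V = retypeForAlignment({16, 4}, 1);   // v4i16 with 1-byte alignment
  assert(V.EltBits == 8 && V.NumElts == 8);      // handled as v8i8
  return 0;
}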
- if (isa<LSBaseSDNode>(N)) { - if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) { - MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8); - assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); - assert(!isLaneOp && "Unexpected generic load/store lane."); - unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); - AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); - } - // Don't set an explicit alignment on regular load/stores that we want - // to transform to VLD/VST 1_UPD nodes. - // This matches the behavior of regular load/stores, which only get an - // explicit alignment if the MMO alignment is larger than the standard - // alignment of the memory type. - // Intrinsics, however, always get an explicit alignment, set to the - // alignment of the MMO. - Alignment = Align(1); - } - - // Create the new updating load/store node. - // First, create an SDVTList for the new updating node's results. - EVT Tys[6]; - unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); - unsigned n; - for (n = 0; n < NumResultVecs; ++n) - Tys[n] = AlignedVecTy; - Tys[n++] = MVT::i32; - Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); - - // Then, gather the new node's operands. - SmallVector<SDValue, 8> Ops; - Ops.push_back(N->getOperand(0)); // incoming chain - Ops.push_back(N->getOperand(Target.AddrOpIdx)); - Ops.push_back(User.Inc); - - if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { - // Try to match the intrinsic's signature - Ops.push_back(StN->getValue()); - } else { - // Loads (and of course intrinsics) match the intrinsics' signature, - // so just add all but the alignment operand. - unsigned LastOperand = - hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands(); - for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i) - Ops.push_back(N->getOperand(i)); - } - - // For all node types, the alignment operand is always the last one. - Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32)); - - // If this is a non-standard-aligned STORE, the penultimate operand is the - // stored value. Bitcast it to the aligned type. - if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { - SDValue &StVal = Ops[Ops.size() - 2]; - StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); - } - - EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, - MemN->getMemOperand()); - - // Update the uses. - SmallVector<SDValue, 5> NewResults; - for (unsigned i = 0; i < NumResultVecs; ++i) - NewResults.push_back(SDValue(UpdN.getNode(), i)); - - // If this is an non-standard-aligned LOAD, the first result is the loaded - // value. Bitcast it to the expected result type. - if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { - SDValue &LdVal = NewResults[0]; - LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); - } - - NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain - DCI.CombineTo(N, NewResults); - DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs)); - - return true; -} - -// If (opcode ptr inc) is and ADD-like instruction, return the -// increment value. Otherwise return 0. 
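// Editor's illustration (not part of the patch): why the helper below also
// accepts OR as an "ADD-like" pointer increment. When the pointer and the
// increment share no set bits, OR and ADD produce the same value (no carries
// can occur), which is common after alignment-based address arithmetic.
// Self-contained sketch with hypothetical values.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t AlignedPtr = 0xffffff00u;  // 256-byte aligned base address
  uint32_t Inc = 0x10u;               // small offset with disjoint bits
  assert((AlignedPtr & Inc) == 0);
  assert((AlignedPtr | Inc) == AlignedPtr + Inc);
  return 0;
}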
-static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, - SDValue Inc, const SelectionDAG &DAG) { - ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); - if (!CInc) - return 0; - - switch (Opcode) { - case ARMISD::VLD1_UPD: - case ISD::ADD: - return CInc->getZExtValue(); - case ISD::OR: { - if (DAG.haveNoCommonBitsSet(Ptr, Inc)) { - // (OR ptr inc) is the same as (ADD ptr inc) - return CInc->getZExtValue(); - } - return 0; - } - default: - return 0; - } -} - -static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) { - switch (N->getOpcode()) { - case ISD::ADD: - case ISD::OR: { - if (isa<ConstantSDNode>(N->getOperand(1))) { - *Ptr = N->getOperand(0); - *CInc = N->getOperand(1); - return true; - } - return false; - } - case ARMISD::VLD1_UPD: { - if (isa<ConstantSDNode>(N->getOperand(2))) { - *Ptr = N->getOperand(1); - *CInc = N->getOperand(2); - return true; - } - return false; - } - default: - return false; - } -} - -static bool isValidBaseUpdate(SDNode *N, SDNode *User) { - // Check that the add is independent of the load/store. - // Otherwise, folding it would create a cycle. Search through Addr - // as well, since the User may not be a direct user of Addr and - // only share a base pointer. - SmallPtrSet<const SDNode *, 32> Visited; - SmallVector<const SDNode *, 16> Worklist; - Worklist.push_back(N); - Worklist.push_back(User); - if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || - SDNode::hasPredecessorHelper(User, Visited, Worklist)) - return false; - return true; -} - /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, /// NEON load/store intrinsics, and generic vector load/stores, to merge /// base address updates. @@ -16024,125 +13071,18 @@ static bool isValidBaseUpdate(SDNode *N, SDNode *User) { /// The caller is assumed to have checked legality. static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || N->getOpcode() == ISD::INTRINSIC_W_CHAIN); const bool isStore = N->getOpcode() == ISD::STORE; const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); - BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx}; - SDValue Addr = N->getOperand(AddrOpIdx); - - SmallVector<BaseUpdateUser, 8> BaseUpdates; - - // Search for a use of the address operand that is an increment. - for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), - UE = Addr.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (UI.getUse().getResNo() != Addr.getResNo() || - User->getNumOperands() != 2) - continue; - - SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 0 : 1); - unsigned ConstInc = - getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG); - - if (ConstInc || User->getOpcode() == ISD::ADD) - BaseUpdates.push_back({User, Inc, ConstInc}); - } - - // If the address is a constant pointer increment itself, find - // another constant increment that has the same base operand - SDValue Base; - SDValue CInc; - if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) { - unsigned Offset = - getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG); - for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end(); - UI != UE; ++UI) { - - SDNode *User = *UI; - if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() || - User->getNumOperands() != 2) - continue; - - SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 
1 : 0); - unsigned UserOffset = - getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG); - - if (!UserOffset || UserOffset <= Offset) - continue; - - unsigned NewConstInc = UserOffset - Offset; - SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32); - BaseUpdates.push_back({User, NewInc, NewConstInc}); - } - } - - // Try to fold the load/store with an update that matches memory - // access size. This should work well for sequential loads. - // - // Filter out invalid updates as well. - unsigned NumValidUpd = BaseUpdates.size(); - for (unsigned I = 0; I < NumValidUpd;) { - BaseUpdateUser &User = BaseUpdates[I]; - if (!isValidBaseUpdate(N, User.N)) { - --NumValidUpd; - std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]); - continue; - } - - if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI)) - return SDValue(); - ++I; - } - BaseUpdates.resize(NumValidUpd); - - // Try to fold with other users. Non-constant updates are considered - // first, and constant updates are sorted to not break a sequence of - // strided accesses (if there is any). - std::stable_sort(BaseUpdates.begin(), BaseUpdates.end(), - [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) { - return LHS.ConstInc < RHS.ConstInc; - }); - for (BaseUpdateUser &User : BaseUpdates) { - if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI)) - return SDValue(); - } - return SDValue(); -} - -static SDValue PerformVLDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); - - return CombineBaseUpdate(N, DCI); -} - -static SDValue PerformMVEVLDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDValue Addr = N->getOperand(2); MemSDNode *MemN = cast<MemSDNode>(N); SDLoc dl(N); - // For the stores, where there are multiple intrinsics we only actually want - // to post-inc the last of the them. - unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); - if (IntNo == Intrinsic::arm_mve_vst2q && - cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1) - return SDValue(); - if (IntNo == Intrinsic::arm_mve_vst4q && - cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3) - return SDValue(); - // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), - UE = Addr.getNode()->use_end(); - UI != UE; ++UI) { + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) @@ -16162,46 +13102,126 @@ static SDValue PerformMVEVLDCombine(SDNode *N, // Find the new opcode for the updating load/store. 
bool isLoadOp = true; + bool isLaneOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; - switch (IntNo) { - default: - llvm_unreachable("unexpected intrinsic for MVE VLDn combine"); - case Intrinsic::arm_mve_vld2q: - NewOpc = ARMISD::VLD2_UPD; - NumVecs = 2; - break; - case Intrinsic::arm_mve_vld4q: - NewOpc = ARMISD::VLD4_UPD; - NumVecs = 4; - break; - case Intrinsic::arm_mve_vst2q: - NewOpc = ARMISD::VST2_UPD; - NumVecs = 2; - isLoadOp = false; - break; - case Intrinsic::arm_mve_vst4q: - NewOpc = ARMISD::VST4_UPD; - NumVecs = 4; - isLoadOp = false; - break; + if (isIntrinsic) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: llvm_unreachable("unexpected intrinsic for Neon base update"); + case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; break; + case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; + NumVecs = 2; break; + case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; + NumVecs = 3; break; + case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; + NumVecs = 4; break; + case Intrinsic::arm_neon_vld2dup: + case Intrinsic::arm_neon_vld3dup: + case Intrinsic::arm_neon_vld4dup: + // TODO: Support updating VLDxDUP nodes. For now, we just skip + // combining base updates for such intrinsics. + continue; + case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; + NumVecs = 2; isLaneOp = true; break; + case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; + NumVecs = 3; isLaneOp = true; break; + case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; + NumVecs = 4; isLaneOp = true; break; + case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLoadOp = false; break; + case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; + NumVecs = 2; isLoadOp = false; break; + case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; + NumVecs = 3; isLoadOp = false; break; + case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; + NumVecs = 4; isLoadOp = false; break; + case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; + NumVecs = 2; isLoadOp = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; + NumVecs = 3; isLoadOp = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; + NumVecs = 4; isLoadOp = false; isLaneOp = true; break; + } + } else { + isLaneOp = true; + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected opcode for Neon base update"); + case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; + case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; + case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; + case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; + case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; isLaneOp = false; break; + case ISD::STORE: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLaneOp = false; isLoadOp = false; break; + } } // Find the size of memory referenced by the load/store. 
EVT VecTy; if (isLoadOp) { VecTy = N->getValueType(0); + } else if (isIntrinsic) { + VecTy = N->getOperand(AddrOpIdx+1).getValueType(); } else { - VecTy = N->getOperand(3).getValueType(); + assert(isStore && "Node has to be a load, a store, or an intrinsic!"); + VecTy = N->getOperand(1).getValueType(); } unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (isLaneOp) + NumBytes /= VecTy.getVectorNumElements(); // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); - if (!CInc || CInc->getZExtValue() != NumBytes) + if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { + // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two + // separate instructions that make it harder to use a non-constant update. continue; + } + + // OK, we found an ADD we can fold into the base update. + // Now, create a _UPD node, taking care of not breaking alignment. + + EVT AlignedVecTy = VecTy; + unsigned Alignment = MemN->getAlignment(); + + // If this is a less-than-standard-aligned load/store, change the type to + // match the standard alignment. + // The alignment is overlooked when selecting _UPD variants; and it's + // easier to introduce bitcasts here than fix that. + // There are 3 ways to get to this base-update combine: + // - intrinsics: they are assumed to be properly aligned (to the standard + // alignment of the memory type), so we don't need to do anything. + // - ARMISD::VLDx nodes: they are only generated from the aforementioned + // intrinsics, so, likewise, there's nothing to do. + // - generic load/store instructions: the alignment is specified as an + // explicit operand, rather than implicitly as the standard alignment + // of the memory type (like the intrisics). We need to change the + // memory type to match the explicit alignment. That way, we don't + // generate non-standard-aligned ARMISD::VLDx nodes. + if (isa<LSBaseSDNode>(N)) { + if (Alignment == 0) + Alignment = 1; + if (Alignment < VecTy.getScalarSizeInBits() / 8) { + MVT EltTy = MVT::getIntegerVT(Alignment * 8); + assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); + assert(!isLaneOp && "Unexpected generic load/store lane."); + unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); + AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); + } + // Don't set an explicit alignment on regular load/stores that we want + // to transform to VLD/VST 1_UPD nodes. + // This matches the behavior of regular load/stores, which only get an + // explicit alignment if the MMO alignment is larger than the standard + // alignment of the memory type. + // Intrinsics, however, always get an explicit alignment, set to the + // alignment of the MMO. + Alignment = 1; + } // Create the new updating load/store node. // First, create an SDVTList for the new updating node's results. @@ -16209,21 +13229,39 @@ static SDValue PerformMVEVLDCombine(SDNode *N, unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); unsigned n; for (n = 0; n < NumResultVecs; ++n) - Tys[n] = VecTy; + Tys[n] = AlignedVecTy; Tys[n++] = MVT::i32; Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); + SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); // Then, gather the new node's operands. 
SmallVector<SDValue, 8> Ops; Ops.push_back(N->getOperand(0)); // incoming chain - Ops.push_back(N->getOperand(2)); // ptr + Ops.push_back(N->getOperand(AddrOpIdx)); Ops.push_back(Inc); - for (unsigned i = 3; i < N->getNumOperands(); ++i) - Ops.push_back(N->getOperand(i)); + if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { + // Try to match the intrinsic's signature + Ops.push_back(StN->getValue()); + } else { + // Loads (and of course intrinsics) match the intrinsics' signature, + // so just add all but the alignment operand. + for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) + Ops.push_back(N->getOperand(i)); + } + + // For all node types, the alignment operand is always the last one. + Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); + + // If this is a non-standard-aligned STORE, the penultimate operand is the + // stored value. Bitcast it to the aligned type. + if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { + SDValue &StVal = Ops[Ops.size()-2]; + StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); + } - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy, + EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, MemN->getMemOperand()); // Update the uses. @@ -16231,16 +13269,30 @@ static SDValue PerformMVEVLDCombine(SDNode *N, for (unsigned i = 0; i < NumResultVecs; ++i) NewResults.push_back(SDValue(UpdN.getNode(), i)); - NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain + // If this is an non-standard-aligned LOAD, the first result is the loaded + // value. Bitcast it to the expected result type. + if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { + SDValue &LdVal = NewResults[0]; + LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); + } + + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); break; } - return SDValue(); } +static SDValue PerformVLDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + return CombineBaseUpdate(N, DCI); +} + /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and @@ -16293,7 +13345,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { for (n = 0; n < NumVecs; ++n) Tys[n] = VT; Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1)); + SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, @@ -16325,21 +13377,8 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { /// PerformVDUPLANECombine - Target-specific dag combine xforms for /// ARMISD::VDUPLANE. static SDValue PerformVDUPLANECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { SDValue Op = N->getOperand(0); - EVT VT = N->getValueType(0); - - // On MVE, we just convert the VDUPLANE to a VDUP with an extract. 
- if (Subtarget->hasMVEIntegerOps()) { - EVT ExtractVT = VT.getVectorElementType(); - // We need to ensure we are creating a legal type. - if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT)) - ExtractVT = MVT::i32; - SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT, - N->getOperand(0), N->getOperand(1)); - return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract); - } // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. @@ -16360,6 +13399,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N, unsigned EltBits; if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) EltSize = 8; + EVT VT = N->getValueType(0); if (EltSize > VT.getScalarSizeInBits()) return SDValue(); @@ -16367,21 +13407,11 @@ static SDValue PerformVDUPLANECombine(SDNode *N, } /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. -static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, +static SDValue PerformVDUPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; SDValue Op = N->getOperand(0); - SDLoc dl(N); - - if (Subtarget->hasMVEIntegerOps()) { - // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will - // need to come from a GPR. - if (Op.getValueType() == MVT::f32) - return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), - DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op)); - else if (Op.getValueType() == MVT::f16) - return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), - DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op)); - } if (!Subtarget->hasNEON()) return SDValue(); @@ -16392,12 +13422,12 @@ static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); if (LD && Op.hasOneUse() && LD->isUnindexed() && LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { - SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1), - DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)}; + SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), + DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); - SDValue VLDDup = - DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops, - LD->getMemoryVT(), LD->getMemOperand()); + SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, + Ops, LD->getMemoryVT(), + LD->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); return VLDDup; } @@ -16406,12 +13436,11 @@ static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, } static SDValue PerformLOADCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); // If this is a legal vector load, try to combine it into a VLD1_UPD. 
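// Editor's illustration (not part of the patch): the source-level pattern the
// VDUP-of-load combine above targets. A scalar load feeding a splat can become
// a single NEON load-and-duplicate (vld1.32 {d0[]}); the ACLE intrinsic below
// expresses the same thing. Sketch only (requires a NEON-enabled target); the
// function name is hypothetical.
#include <arm_neon.h>

float32x4_t splat_from_memory(const float *P) {
  return vld1q_dup_f32(P); // one load, replicated into all four lanes
}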
- if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() && + if (ISD::isNormalLoad(N) && VT.isVector() && DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) return CombineBaseUpdate(N, DCI); @@ -16495,7 +13524,7 @@ static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, ShuffWide, DAG.getIntPtrConstant(I, DL)); SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), - St->getAlign(), St->getMemOperand()->getFlags()); + St->getAlignment(), St->getMemOperand()->getFlags()); BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); Chains.push_back(Ch); @@ -16503,7 +13532,7 @@ static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } -// Try taking a single vector store from an fpround (which would otherwise turn +// Try taking a single vector store from an truncate (which would otherwise turn // into an expensive buildvector) and splitting it into a series of narrowing // stores. static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, @@ -16511,7 +13540,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) return SDValue(); SDValue Trunc = St->getValue(); - if (Trunc->getOpcode() != ISD::FP_ROUND) + if (Trunc->getOpcode() != ISD::TRUNCATE) return SDValue(); EVT FromVT = Trunc->getOperand(0).getValueType(); EVT ToVT = Trunc.getValueType(); @@ -16521,73 +13550,34 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, EVT ToEltVT = ToVT.getVectorElementType(); EVT FromEltVT = FromVT.getVectorElementType(); - if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16) - return SDValue(); - - unsigned NumElements = 4; - if (FromVT.getVectorNumElements() % NumElements != 0) + unsigned NumElements = 0; + if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8)) + NumElements = 4; + if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8) + NumElements = 8; + if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements || + FromVT.getVectorNumElements() % NumElements != 0) return SDValue(); - // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so - // use the VMOVN over splitting the store. We are looking for patterns of: - // !rev: 0 N 1 N+1 2 N+2 ... - // rev: N 0 N+1 1 N+2 2 ... - // The shuffle may either be a single source (in which case N = NumElts/2) or - // two inputs extended with concat to the same size (in which case N = - // NumElts). - auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) { - ArrayRef<int> M = SVN->getMask(); - unsigned NumElts = ToVT.getVectorNumElements(); - if (SVN->getOperand(1).isUndef()) - NumElts /= 2; - - unsigned Off0 = Rev ? NumElts : 0; - unsigned Off1 = Rev ? 
0 : NumElts; - - for (unsigned I = 0; I < NumElts; I += 2) { - if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2)) - return false; - if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2)) - return false; - } - - return true; - }; - - if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0))) - if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true)) - return SDValue(); - - LLVMContext &C = *DAG.getContext(); SDLoc DL(St); // Details about the old store SDValue Ch = St->getChain(); SDValue BasePtr = St->getBasePtr(); - Align Alignment = St->getOriginalAlign(); + unsigned Alignment = St->getOriginalAlignment(); MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); AAMDNodes AAInfo = St->getAAInfo(); - // We split the store into slices of NumElements. fp16 trunc stores are vcvt - // and then stored as truncating integer stores. - EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements); - EVT NewToVT = EVT::getVectorVT( - C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements); + EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements); + EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements); SmallVector<SDValue, 4> Stores; for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; - SDValue NewPtr = - DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); + SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), DAG.getConstant(i * NumElements, DL, MVT::i32)); - - SDValue FPTrunc = - DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16), - Extract, DAG.getConstant(0, DL, MVT::i32)); - Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc); - SDValue Store = DAG.getTruncStore( Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), NewToVT, Alignment, MMOFlags, AAInfo); @@ -16596,83 +13586,6 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); } -// Try taking a single vector store from an MVETRUNC (which would otherwise turn -// into an expensive buildvector) and splitting it into a series of narrowing -// stores. 
-static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, - SelectionDAG &DAG) { - if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) - return SDValue(); - SDValue Trunc = St->getValue(); - if (Trunc->getOpcode() != ARMISD::MVETRUNC) - return SDValue(); - EVT FromVT = Trunc->getOperand(0).getValueType(); - EVT ToVT = Trunc.getValueType(); - - LLVMContext &C = *DAG.getContext(); - SDLoc DL(St); - // Details about the old store - SDValue Ch = St->getChain(); - SDValue BasePtr = St->getBasePtr(); - Align Alignment = St->getOriginalAlign(); - MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); - AAMDNodes AAInfo = St->getAAInfo(); - - EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(), - FromVT.getVectorNumElements()); - - SmallVector<SDValue, 4> Stores; - for (unsigned i = 0; i < Trunc.getNumOperands(); i++) { - unsigned NewOffset = - i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8; - SDValue NewPtr = - DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); - - SDValue Extract = Trunc.getOperand(i); - SDValue Store = DAG.getTruncStore( - Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), - NewToVT, Alignment, MMOFlags, AAInfo); - Stores.push_back(Store); - } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); -} - -// Given a floating point store from an extracted vector, with an integer -// VGETLANE that already exists, store the existing VGETLANEu directly. This can -// help reduce fp register pressure, doesn't require the fp extract and allows -// use of more integer post-inc stores not available with vstr. -static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) { - if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) - return SDValue(); - SDValue Extract = St->getValue(); - EVT VT = Extract.getValueType(); - // For now only uses f16. This may be useful for f32 too, but that will - // be bitcast(extract), not the VGETLANEu we currently check here. - if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); - - SDNode *GetLane = - DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32), - {Extract.getOperand(0), Extract.getOperand(1)}); - if (!GetLane) - return SDValue(); - - LLVMContext &C = *DAG.getContext(); - SDLoc DL(St); - // Create a new integer store to replace the existing floating point version. - SDValue Ch = St->getChain(); - SDValue BasePtr = St->getBasePtr(); - Align Alignment = St->getOriginalAlign(); - MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); - AAMDNodes AAInfo = St->getAAInfo(); - EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits()); - SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr, - St->getPointerInfo(), NewToVT, Alignment, - MMOFlags, AAInfo); - - return Store; -} - /// PerformSTORECombine - Target-specific dag combine xforms for /// ISD::STORE. 
static SDValue PerformSTORECombine(SDNode *N, @@ -16688,15 +13601,9 @@ static SDValue PerformSTORECombine(SDNode *N, if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) return Store; - if (Subtarget->hasMVEIntegerOps()) { + if (Subtarget->hasMVEIntegerOps()) if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) return NewToken; - if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG)) - return NewChain; - if (SDValue NewToken = - PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG)) - return NewToken; - } if (!ISD::isNormalStore(St)) return SDValue(); @@ -16711,15 +13618,15 @@ static SDValue PerformSTORECombine(SDNode *N, SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore( St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), - BasePtr, St->getPointerInfo(), St->getOriginalAlign(), + BasePtr, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(isBigEndian ? 0 : 1), - OffsetPtr, St->getPointerInfo().getWithOffset(4), - St->getOriginalAlign(), + OffsetPtr, St->getPointerInfo(), + std::min(4U, St->getAlignment() / 2), St->getMemOperand()->getFlags()); } @@ -16743,7 +13650,7 @@ static SDValue PerformSTORECombine(SDNode *N, DCI.AddToWorklist(ExtElt.getNode()); DCI.AddToWorklist(V.getNode()); return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), - St->getPointerInfo(), St->getAlign(), + St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags(), St->getAAInfo()); } @@ -16812,49 +13719,6 @@ static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, return FixConv; } -static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEFloatOps()) - return SDValue(); - - // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x) - // The second form can be more easily turned into a predicated vadd, and - // possibly combined into a fma to become a predicated vfma. - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - EVT VT = N->getValueType(0); - SDLoc DL(N); - - // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set, - // which these VMOV's represent. - auto isIdentitySplat = [&](SDValue Op, bool NSZ) { - if (Op.getOpcode() != ISD::BITCAST || - Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM) - return false; - uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0); - if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ))) - return true; - if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ))) - return true; - return false; - }; - - if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT) - std::swap(Op0, Op1); - - if (Op1.getOpcode() != ISD::VSELECT) - return SDValue(); - - SDNodeFlags FaddFlags = N->getFlags(); - bool NSZ = FaddFlags.hasNoSignedZeros(); - if (!isIdentitySplat(Op1.getOperand(2), NSZ)) - return SDValue(); - - SDValue FAdd = - DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags); - return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags); -} - /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) /// can replace combinations of VCVT (integer to floating-point) and VDIV /// when the VDIV has a constant operand that is a power of 2. 
@@ -16914,351 +13778,8 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, ConvInput, DAG.getConstant(C, dl, MVT::i32)); } -static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - if (!ST->hasMVEIntegerOps()) - return SDValue(); - - assert(N->getOpcode() == ISD::VECREDUCE_ADD); - EVT ResVT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDLoc dl(N); - - // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y) - if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD && - (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 || - N0.getValueType() == MVT::v16i8)) { - SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0)); - SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1)); - return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1); - } - - // We are looking for something that will have illegal types if left alone, - // but that we can convert to a single instruction under MVE. For example - // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A - // or - // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B - - // The legal cases are: - // VADDV u/s 8/16/32 - // VMLAV u/s 8/16/32 - // VADDLV u/s 32 - // VMLALV u/s 16/32 - - // If the input vector is smaller than legal (v4i8/v4i16 for example) we can - // extend it and use v4i32 instead. - auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) { - EVT AVT = A.getValueType(); - return any_of(ExtTypes, [&](MVT Ty) { - return AVT.getVectorNumElements() == Ty.getVectorNumElements() && - AVT.bitsLE(Ty); - }); - }; - auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) { - EVT AVT = A.getValueType(); - if (!AVT.is128BitVector()) - A = DAG.getNode(ExtendCode, dl, - AVT.changeVectorElementType(MVT::getIntegerVT( - 128 / AVT.getVectorMinNumElements())), - A); - return A; - }; - auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) { - if (ResVT != RetTy || N0->getOpcode() != ExtendCode) - return SDValue(); - SDValue A = N0->getOperand(0); - if (ExtTypeMatches(A, ExtTypes)) - return ExtendIfNeeded(A, ExtendCode); - return SDValue(); - }; - auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode, - ArrayRef<MVT> ExtTypes, SDValue &Mask) { - if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || - !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) - return SDValue(); - Mask = N0->getOperand(0); - SDValue Ext = N0->getOperand(1); - if (Ext->getOpcode() != ExtendCode) - return SDValue(); - SDValue A = Ext->getOperand(0); - if (ExtTypeMatches(A, ExtTypes)) - return ExtendIfNeeded(A, ExtendCode); - return SDValue(); - }; - auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, - SDValue &A, SDValue &B) { - // For a vmla we are trying to match a larger pattern: - // ExtA = sext/zext A - // ExtB = sext/zext B - // Mul = mul ExtA, ExtB - // vecreduce.add Mul - // There might also be en extra extend between the mul and the addreduce, so - // long as the bitwidth is high enough to make them equivalent (for example - // original v8i16 might be mul at v8i32 and the reduce happens at v8i64). 
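// Illustrative aside, not part of the patch: a standalone scalar model of the
// pattern the IsVMLAV matcher above describes, i.e.
//   vecreduce.add(mul(sext(A), sext(B)))   on v8i16 inputs,
// which the backend's VMLAV node computes in one go. Values are invented.
#include <cstdint>
#include <cstdio>

// Every lane is sign-extended to i32 before the multiply and the products are
// accumulated into a single i32, matching the DAG pattern in the comment above.
static int32_t vmlav_s16_reference(const int16_t A[8], const int16_t B[8]) {
  int32_t Acc = 0;
  for (int I = 0; I < 8; ++I)
    Acc += int32_t(A[I]) * int32_t(B[I]);
  return Acc;
}

int main() {
  const int16_t A[8] = {1, -2, 3, -4, 5, -6, 7, -8};
  const int16_t B[8] = {100, 200, 300, 400, 500, 600, 700, 800};
  std::printf("reduction = %d\n", (int)vmlav_s16_reference(A, B));
  return 0;
}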
- if (ResVT != RetTy) - return false; - SDValue Mul = N0; - if (Mul->getOpcode() == ExtendCode && - Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= - ResVT.getScalarSizeInBits()) - Mul = Mul->getOperand(0); - if (Mul->getOpcode() != ISD::MUL) - return false; - SDValue ExtA = Mul->getOperand(0); - SDValue ExtB = Mul->getOperand(1); - if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode) - return false; - A = ExtA->getOperand(0); - B = ExtB->getOperand(0); - if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) { - A = ExtendIfNeeded(A, ExtendCode); - B = ExtendIfNeeded(B, ExtendCode); - return true; - } - return false; - }; - auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, - SDValue &A, SDValue &B, SDValue &Mask) { - // Same as the pattern above with a select for the zero predicated lanes - // ExtA = sext/zext A - // ExtB = sext/zext B - // Mul = mul ExtA, ExtB - // N0 = select Mask, Mul, 0 - // vecreduce.add N0 - if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || - !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) - return false; - Mask = N0->getOperand(0); - SDValue Mul = N0->getOperand(1); - if (Mul->getOpcode() == ExtendCode && - Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= - ResVT.getScalarSizeInBits()) - Mul = Mul->getOperand(0); - if (Mul->getOpcode() != ISD::MUL) - return false; - SDValue ExtA = Mul->getOperand(0); - SDValue ExtB = Mul->getOperand(1); - if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode) - return false; - A = ExtA->getOperand(0); - B = ExtB->getOperand(0); - if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) { - A = ExtendIfNeeded(A, ExtendCode); - B = ExtendIfNeeded(B, ExtendCode); - return true; - } - return false; - }; - auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) { - // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64 - // reductions. The operands are extended with MVEEXT, but as they are - // reductions the lane orders do not matter. MVEEXT may be combined with - // loads to produce two extending loads, or else they will be expanded to - // VREV/VMOVL. - EVT VT = Ops[0].getValueType(); - if (VT == MVT::v16i8) { - assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) && - "Unexpected illegal long reduction opcode"); - bool IsUnsigned = Opcode == ARMISD::VMLALVu; - - SDValue Ext0 = - DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl, - DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]); - SDValue Ext1 = - DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl, - DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]); - - SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), - Ext0, Ext1); - SDValue MLA1 = - DAG.getNode(IsUnsigned ? 
ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl, - DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1), - Ext0.getValue(1), Ext1.getValue(1)); - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1)); - } - SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops); - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node, - SDValue(Node.getNode(), 1)); - }; - - SDValue A, B; - SDValue Mask; - if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) - return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B); - if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) - return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B); - if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32}, - A, B)) - return Create64bitNode(ARMISD::VMLALVs, {A, B}); - if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32}, - A, B)) - return Create64bitNode(ARMISD::VMLALVu, {A, B}); - if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B)); - if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B)); - - if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, - Mask)) - return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask); - if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, - Mask)) - return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask); - if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B, - Mask)) - return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask}); - if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B, - Mask)) - return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask}); - if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask)); - if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask)); - - if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8})) - return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A); - if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8})) - return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A); - if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32})) - return Create64bitNode(ARMISD::VADDLVs, {A}); - if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32})) - return Create64bitNode(ARMISD::VADDLVu, {A}); - if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8})) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A)); - if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8})) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A)); - - if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) - return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask); - if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) - return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask); - if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask)) - return Create64bitNode(ARMISD::VADDLVps, {A, Mask}); - if (SDValue A = IsPredVADDV(MVT::i64, 
ISD::ZERO_EXTEND, {MVT::v4i32}, Mask)) - return Create64bitNode(ARMISD::VADDLVpu, {A, Mask}); - if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask)); - if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask)); - - // Some complications. We can get a case where the two inputs of the mul are - // the same, then the output sext will have been helpfully converted to a - // zext. Turn it back. - SDValue Op = N0; - if (Op->getOpcode() == ISD::VSELECT) - Op = Op->getOperand(1); - if (Op->getOpcode() == ISD::ZERO_EXTEND && - Op->getOperand(0)->getOpcode() == ISD::MUL) { - SDValue Mul = Op->getOperand(0); - if (Mul->getOperand(0) == Mul->getOperand(1) && - Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) { - SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul); - if (Op != N0) - Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0), - N0->getOperand(0), Ext, N0->getOperand(2)); - return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext); - } - } - - return SDValue(); -} - -static SDValue PerformVMOVNCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - unsigned IsTop = N->getConstantOperandVal(2); - - // VMOVNT a undef -> a - // VMOVNB a undef -> a - // VMOVNB undef a -> a - if (Op1->isUndef()) - return Op0; - if (Op0->isUndef() && !IsTop) - return Op1; - - // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b) - // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b) - if ((Op1->getOpcode() == ARMISD::VQMOVNs || - Op1->getOpcode() == ARMISD::VQMOVNu) && - Op1->getConstantOperandVal(2) == 0) - return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0), - Op0, Op1->getOperand(1), N->getOperand(2)); - - // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from - // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting - // into the top or bottom lanes. - unsigned NumElts = N->getValueType(0).getVectorNumElements(); - APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1)); - APInt Op0DemandedElts = - IsTop ? Op1DemandedElts - : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1)); - - const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI)) - return SDValue(N, 0); - if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI)) - return SDValue(N, 0); - - return SDValue(); -} - -static SDValue PerformVQMOVNCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SDValue Op0 = N->getOperand(0); - unsigned IsTop = N->getConstantOperandVal(2); - - unsigned NumElts = N->getValueType(0).getVectorNumElements(); - APInt Op0DemandedElts = - APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1) - : APInt::getHighBitsSet(2, 1)); - - const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI)) - return SDValue(N, 0); - return SDValue(); -} - -static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) { - SDLoc DL(N); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - - // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from - // uses of the intrinsics. 
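// Illustrative aside, not part of the patch: the long-shift combine below
// relies on the identity that a shift by a negative count -C (0 < C <= 32) is
// the opposite-direction shift by C. Here the {low, high} register pair of the
// LSLL/LSRL nodes is modelled as a single 64-bit value; this is a sketch of
// that identity only, not the backend code.
#include <cassert>
#include <cstdint>

// "Left shift by a possibly negative count": negative counts shift right,
// which is exactly the LSLL <-> LSRL rewrite implemented just below.
static uint64_t long_shift_left(uint64_t X, int Amt) {
  if (Amt >= 0)
    return Amt < 64 ? X << Amt : 0;
  return -Amt < 64 ? X >> -Amt : 0;
}

int main() {
  const uint64_t X = 0x0123456789abcdefULL;
  for (int C = 1; C <= 32; ++C)
    assert(long_shift_left(X, -C) == X >> C); // X << -C behaves as X >> C
  assert(long_shift_left(X, 0) == X);         // shift by zero is the identity
  return 0;
}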
- if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) { - int ShiftAmt = C->getSExtValue(); - if (ShiftAmt == 0) { - SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL); - DAG.ReplaceAllUsesWith(N, Merge.getNode()); - return SDValue(); - } - - if (ShiftAmt >= -32 && ShiftAmt < 0) { - unsigned NewOpcode = - N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL; - SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1, - DAG.getConstant(-ShiftAmt, DL, MVT::i32)); - DAG.ReplaceAllUsesWith(N, NewShift.getNode()); - return NewShift; - } - } - - return SDValue(); -} - /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. -SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; +static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); switch (IntNo) { default: @@ -17407,72 +13928,6 @@ SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, case Intrinsic::arm_neon_vqrshiftu: // No immediate versions of these to check for. break; - - case Intrinsic::arm_mve_vqdmlah: - case Intrinsic::arm_mve_vqdmlash: - case Intrinsic::arm_mve_vqrdmlah: - case Intrinsic::arm_mve_vqrdmlash: - case Intrinsic::arm_mve_vmla_n_predicated: - case Intrinsic::arm_mve_vmlas_n_predicated: - case Intrinsic::arm_mve_vqdmlah_predicated: - case Intrinsic::arm_mve_vqdmlash_predicated: - case Intrinsic::arm_mve_vqrdmlah_predicated: - case Intrinsic::arm_mve_vqrdmlash_predicated: { - // These intrinsics all take an i32 scalar operand which is narrowed to the - // size of a single lane of the vector type they return. So we don't need - // any bits of that operand above that point, which allows us to eliminate - // uxth/sxth. - unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); - APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); - if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI)) - return SDValue(); - break; - } - - case Intrinsic::arm_mve_minv: - case Intrinsic::arm_mve_maxv: - case Intrinsic::arm_mve_minav: - case Intrinsic::arm_mve_maxav: - case Intrinsic::arm_mve_minv_predicated: - case Intrinsic::arm_mve_maxv_predicated: - case Intrinsic::arm_mve_minav_predicated: - case Intrinsic::arm_mve_maxav_predicated: { - // These intrinsics all take an i32 scalar operand which is narrowed to the - // size of a single lane of the vector type they take as the other input. - unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits(); - APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); - if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) - return SDValue(); - break; - } - - case Intrinsic::arm_mve_addv: { - // Turn this intrinsic straight into the appropriate ARMISD::VADDV node, - // which allow PerformADDVecReduce to turn it into VADDLV when possible. - bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); - unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs; - return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1)); - } - - case Intrinsic::arm_mve_addlv: - case Intrinsic::arm_mve_addlv_predicated: { - // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR - // which recombines the two outputs into an i64 - bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); - unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ? - (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) : - (Unsigned ? 
ARMISD::VADDLVpu : ARMISD::VADDLVps); - - SmallVector<SDValue, 4> Ops; - for (unsigned i = 1, e = N->getNumOperands(); i < e; i++) - if (i != 2) // skip the unsigned flag - Ops.push_back(N->getOperand(i)); - - SDLoc dl(N); - SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops); - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0), - val.getValue(1)); - } } return SDValue(); @@ -17488,6 +13943,18 @@ static SDValue PerformShiftCombine(SDNode *N, const ARMSubtarget *ST) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); + if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { + // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high + // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. + SDValue N1 = N->getOperand(1); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { + SDValue N0 = N->getOperand(0); + if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && + DAG.MaskedValueIsZero(N0.getOperand(0), + APInt::getHighBitsSet(32, 16))) + return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); + } + } if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && N->getOperand(0)->getOpcode() == ISD::AND && @@ -17527,7 +13994,7 @@ static SDValue PerformShiftCombine(SDNode *N, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!VT.isVector() || !TLI.isTypeLegal(VT)) return SDValue(); - if (ST->hasMVEIntegerOps()) + if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) return SDValue(); int64_t Cnt; @@ -17556,10 +14023,9 @@ static SDValue PerformShiftCombine(SDNode *N, return SDValue(); } -// Look for a sign/zero/fpextend extend of a larger than legal load. This can be -// split into multiple extending loads, which are simpler to deal with than an -// arbitrary extend. For fp extends we use an integer extending load and a VCVTL -// to convert the type to an f32. +// Look for a sign/zero extend of a larger than legal load. This can be split +// into two extending loads, which are simpler to deal with than an arbitrary +// sign extend. static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::LOAD) @@ -17577,66 +14043,49 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { EVT FromEltVT = FromVT.getVectorElementType(); unsigned NumElements = 0; - if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8) - NumElements = 4; - if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16) + if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) NumElements = 4; + if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) + NumElements = 8; if (NumElements == 0 || - (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) || + FromVT.getVectorNumElements() == NumElements || FromVT.getVectorNumElements() % NumElements != 0 || !isPowerOf2_32(NumElements)) return SDValue(); - LLVMContext &C = *DAG.getContext(); SDLoc DL(LD); // Details about the old load SDValue Ch = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); - Align Alignment = LD->getOriginalAlign(); + unsigned Alignment = LD->getOriginalAlignment(); MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); AAMDNodes AAInfo = LD->getAAInfo(); ISD::LoadExtType NewExtType = N->getOpcode() == ISD::SIGN_EXTEND ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); - EVT NewFromVT = EVT::getVectorVT( - C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); - EVT NewToVT = EVT::getVectorVT( - C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); - - SmallVector<SDValue, 4> Loads; - SmallVector<SDValue, 4> Chains; - for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { - unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; - SDValue NewPtr = - DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); - - SDValue NewLoad = - DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, - LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, - Alignment, MMOFlags, AAInfo); - Loads.push_back(NewLoad); - Chains.push_back(SDValue(NewLoad.getNode(), 1)); - } - - // Float truncs need to extended with VCVTB's into their floating point types. - if (FromEltVT == MVT::f16) { - SmallVector<SDValue, 4> Extends; - - for (unsigned i = 0; i < Loads.size(); i++) { - SDValue LoadBC = - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]); - SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC, - DAG.getConstant(0, DL, MVT::i32)); - Extends.push_back(FPExt); - } - - Loads = Extends; - } - - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext()); + EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); + unsigned NewOffset = NewFromVT.getSizeInBits() / 8; + SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); + + // Split the load in half, each side of which is extended separately. This + // is good enough, as legalisation will take it from there. They are either + // already legal or they will be split further into something that is + // legal. + SDValue NewLoad1 = + DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset, + LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo); + SDValue NewLoad2 = + DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, + LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, + Alignment, MMOFlags, AAInfo); + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(NewLoad1.getNode(), 1), + SDValue(NewLoad2.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2); } /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, @@ -17684,164 +14133,6 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - if (ST->hasMVEFloatOps()) - if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) - return NewLoad; - - return SDValue(); -} - -// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating -// constant bounds. 
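// Illustrative aside, not part of the patch: the smin/smax-to-ssat/usat
// rewrite below rests on the equivalence between clamping against suitable
// constants and saturating. A standalone scalar check of both matched cases;
// the helpers model the clamp semantics only, not the ARMISD::SSAT/USAT
// operand encoding.
#include <algorithm>
#include <cassert>
#include <cstdint>

static int32_t clamp_smin_smax(int32_t X, int32_t Lo, int32_t Hi) {
  return std::min(std::max(X, Lo), Hi); // smin(smax(x, Lo), Hi)
}

static int32_t saturate_signed(int32_t X, unsigned Bits) {
  // Signed saturation to [-2^(Bits-1), 2^(Bits-1) - 1].
  int32_t Hi = (1 << (Bits - 1)) - 1, Lo = -(1 << (Bits - 1));
  return X > Hi ? Hi : (X < Lo ? Lo : X);
}

static int32_t saturate_unsigned(int32_t X, unsigned Bits) {
  // Unsigned saturation to [0, 2^Bits - 1].
  int32_t Hi = (1 << Bits) - 1;
  return X > Hi ? Hi : (X < 0 ? 0 : X);
}

int main() {
  const unsigned K = 7; // MinC = (1 << K) - 1 = 127, as the matcher requires
  for (int32_t X : {-100000, -129, -128, -1, 0, 1, 127, 128, 100000}) {
    // MinC == ~MaxC case: clamping to [-(1<<K), (1<<K)-1] is (K+1)-bit ssat.
    assert(clamp_smin_smax(X, -(1 << K), (1 << K) - 1) ==
           saturate_signed(X, K + 1));
    // MaxC == 0 case: clamping to [0, (1<<K)-1] is K-bit usat.
    assert(clamp_smin_smax(X, 0, (1 << K) - 1) == saturate_unsigned(X, K));
  }
  return 0;
}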
-static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) && - !Subtarget->isThumb2()) - return SDValue(); - - EVT VT = Op.getValueType(); - SDValue Op0 = Op.getOperand(0); - - if (VT != MVT::i32 || - (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) || - !isa<ConstantSDNode>(Op.getOperand(1)) || - !isa<ConstantSDNode>(Op0.getOperand(1))) - return SDValue(); - - SDValue Min = Op; - SDValue Max = Op0; - SDValue Input = Op0.getOperand(0); - if (Min.getOpcode() == ISD::SMAX) - std::swap(Min, Max); - - APInt MinC = Min.getConstantOperandAPInt(1); - APInt MaxC = Max.getConstantOperandAPInt(1); - - if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX || - !(MinC + 1).isPowerOf2()) - return SDValue(); - - SDLoc DL(Op); - if (MinC == ~MaxC) - return DAG.getNode(ARMISD::SSAT, DL, VT, Input, - DAG.getConstant(MinC.countTrailingOnes(), DL, VT)); - if (MaxC == 0) - return DAG.getNode(ARMISD::USAT, DL, VT, Input, - DAG.getConstant(MinC.countTrailingOnes(), DL, VT)); - - return SDValue(); -} - -/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating -/// saturates. -static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - - if (VT == MVT::i32) - return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST); - - if (!ST->hasMVEIntegerOps()) - return SDValue(); - - if (SDValue V = PerformVQDMULHCombine(N, DAG)) - return V; - - if (VT != MVT::v4i32 && VT != MVT::v8i16) - return SDValue(); - - auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) { - // Check one is a smin and the other is a smax - if (Min->getOpcode() != ISD::SMIN) - std::swap(Min, Max); - if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX) - return false; - - APInt SaturateC; - if (VT == MVT::v4i32) - SaturateC = APInt(32, (1 << 15) - 1, true); - else //if (VT == MVT::v8i16) - SaturateC = APInt(16, (1 << 7) - 1, true); - - APInt MinC, MaxC; - if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || - MinC != SaturateC) - return false; - if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) || - MaxC != ~SaturateC) - return false; - return true; - }; - - if (IsSignedSaturate(N, N0.getNode())) { - SDLoc DL(N); - MVT ExtVT, HalfVT; - if (VT == MVT::v4i32) { - HalfVT = MVT::v8i16; - ExtVT = MVT::v4i16; - } else { // if (VT == MVT::v8i16) - HalfVT = MVT::v16i8; - ExtVT = MVT::v8i8; - } - - // Create a VQMOVNB with undef top lanes, then signed extended into the top - // half. That extend will hopefully be removed if only the bottom bits are - // demanded (though a truncating store, for example). 
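// Illustrative aside, not part of the patch: the matcher above recognises
// smin/smax bounds of -2^15 and 2^15 - 1 on a v4i32 precisely because clamping
// each lane to the i16 range and then truncating it is the signed saturating
// narrow that VQMOVN performs. A standalone per-lane model with invented
// values.
#include <algorithm>
#include <cassert>
#include <cstdint>

static int16_t vqmovn_s32_lane(int32_t X) {
  // Clamp to [INT16_MIN, INT16_MAX], then truncate: a saturating narrow.
  return static_cast<int16_t>(std::min(std::max(X, -32768), 32767));
}

int main() {
  const int32_t In[4] = {40000, -40000, 123, -123};
  const int16_t Expected[4] = {32767, -32768, 123, -123};
  for (int I = 0; I < 4; ++I)
    assert(vqmovn_s32_lane(In[I]) == Expected[I]);
  return 0;
}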
- SDValue VQMOVN = - DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT), - N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32)); - SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); - return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast, - DAG.getValueType(ExtVT)); - } - - auto IsUnsignedSaturate = [&](SDNode *Min) { - // For unsigned, we just need to check for <= 0xffff - if (Min->getOpcode() != ISD::UMIN) - return false; - - APInt SaturateC; - if (VT == MVT::v4i32) - SaturateC = APInt(32, (1 << 16) - 1, true); - else //if (VT == MVT::v8i16) - SaturateC = APInt(16, (1 << 8) - 1, true); - - APInt MinC; - if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || - MinC != SaturateC) - return false; - return true; - }; - - if (IsUnsignedSaturate(N)) { - SDLoc DL(N); - MVT HalfVT; - unsigned ExtConst; - if (VT == MVT::v4i32) { - HalfVT = MVT::v8i16; - ExtConst = 0x0000FFFF; - } else { //if (VT == MVT::v8i16) - HalfVT = MVT::v16i8; - ExtConst = 0x00FF; - } - - // Create a VQMOVNB with undef top lanes, then ZExt into the top half with - // an AND. That extend will hopefully be removed if only the bottom bits are - // demanded (though a truncating store, for example). - SDValue VQMOVN = - DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0, - DAG.getConstant(0, DL, MVT::i32)); - SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); - return DAG.getNode(ISD::AND, DL, VT, Bitcast, - DAG.getConstant(ExtConst, DL, VT)); - } - - return SDValue(); -} - static const APInt *isPowerOf2Constant(SDValue V) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); if (!C) @@ -17963,7 +14254,7 @@ static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1)); if (!Const) return SDValue(); - if (Const->isZero()) + if (Const->isNullValue()) Imm = 0; else if (Const->isOne()) Imm = 1; @@ -17974,7 +14265,7 @@ static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, } case ISD::INTRINSIC_W_CHAIN: { unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue(); - if (IntOp != Intrinsic::test_start_loop_iterations && + if (IntOp != Intrinsic::test_set_loop_iterations && IntOp != Intrinsic::loop_decrement_reg) return SDValue(); return N; @@ -17989,7 +14280,7 @@ static SDValue PerformHWLoopCombine(SDNode *N, // The hwloop intrinsics that we're interested are used for control-flow, // either for entering or exiting the loop: - // - test.start.loop.iterations will test whether its operand is zero. If it + // - test.set.loop.iterations will test whether its operand is zero. If it // is zero, the proceeding branch should not enter the loop. // - loop.decrement.reg also tests whether its operand is zero. 
If it is // zero, the proceeding branch should not branch back to the beginning of @@ -18015,7 +14306,7 @@ static SDValue PerformHWLoopCombine(SDNode *N, Cond = N->getOperand(2); Dest = N->getOperand(4); if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { - if (!Const->isOne() && !Const->isZero()) + if (!Const->isOne() && !Const->isNullValue()) return SDValue(); Imm = Const->getZExtValue(); } else @@ -18064,25 +14355,21 @@ static SDValue PerformHWLoopCombine(SDNode *N, DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); }; - if (IntOp == Intrinsic::test_start_loop_iterations) { + if (IntOp == Intrinsic::test_set_loop_iterations) { SDValue Res; - SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements); // We expect this 'instruction' to branch when the counter is zero. if (IsTrueIfZero(CC, Imm)) { - SDValue Ops[] = {Chain, Setup, Dest}; + SDValue Ops[] = { Chain, Elements, Dest }; Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); } else { // The logic is the reverse of what we need for WLS, so find the other // basic block target: the target of the proceeding br. UpdateUncondBr(Br, Dest, DAG); - SDValue Ops[] = {Chain, Setup, OtherTarget}; + SDValue Ops[] = { Chain, Elements, OtherTarget }; Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); } - // Update LR count to the new value - DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup); - // Update chain - DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0)); + DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); return Res; } else { SDValue Size = DAG.getTargetConstant( @@ -18220,23 +14507,6 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { if (!VT.isInteger()) return SDValue(); - // Fold away an unneccessary CMPZ/CMOV - // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) -> - // if C1==EQ -> CMOV A, B, C2, $cpsr, D - // if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D - if (N->getConstantOperandVal(2) == ARMCC::EQ || - N->getConstantOperandVal(2) == ARMCC::NE) { - ARMCC::CondCodes Cond; - if (SDValue C = IsCMPZCSINC(N->getOperand(4).getNode(), Cond)) { - if (N->getConstantOperandVal(2) == ARMCC::NE) - Cond = ARMCC::getOppositeCondition(Cond); - return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), - N->getOperand(1), - DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32), - N->getOperand(3), C); - } - } - // Materialize a boolean comparison for integers so we can avoid branching. if (isNullConstant(FalseVal)) { if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { @@ -18344,325 +14614,10 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { return Res; } -static SDValue PerformBITCASTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *ST) { - SelectionDAG &DAG = DCI.DAG; - SDValue Src = N->getOperand(0); - EVT DstVT = N->getValueType(0); - - // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE. - if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) { - EVT SrcVT = Src.getValueType(); - if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits()) - return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0)); - } - - // We may have a bitcast of something that has already had this bitcast - // combine performed on it, so skip past any VECTOR_REG_CASTs. 
- while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST) - Src = Src.getOperand(0); - - // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that - // would be generated is at least the width of the element type. - EVT SrcVT = Src.getValueType(); - if ((Src.getOpcode() == ARMISD::VMOVIMM || - Src.getOpcode() == ARMISD::VMVNIMM || - Src.getOpcode() == ARMISD::VMOVFPIMM) && - SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() && - DAG.getDataLayout().isBigEndian()) - return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src); - - // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x - if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) - return R; - - return SDValue(); -} - -// Some combines for the MVETrunc truncations legalizer helper. Also lowers the -// node into stack operations after legalizeOps. -SDValue ARMTargetLowering::PerformMVETruncCombine( - SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - SDLoc DL(N); - - // MVETrunc(Undef, Undef) -> Undef - if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); })) - return DAG.getUNDEF(VT); - - // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc - if (N->getNumOperands() == 2 && - N->getOperand(0).getOpcode() == ARMISD::MVETRUNC && - N->getOperand(1).getOpcode() == ARMISD::MVETRUNC) - return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0), - N->getOperand(0).getOperand(1), - N->getOperand(1).getOperand(0), - N->getOperand(1).getOperand(1)); - - // MVETrunc(shuffle, shuffle) -> VMOVN - if (N->getNumOperands() == 2 && - N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && - N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) { - auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode()); - auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode()); - - if (S0->getOperand(0) == S1->getOperand(0) && - S0->getOperand(1) == S1->getOperand(1)) { - // Construct complete shuffle mask - SmallVector<int, 8> Mask(S0->getMask()); - Mask.append(S1->getMask().begin(), S1->getMask().end()); - - if (isVMOVNTruncMask(Mask, VT, false)) - return DAG.getNode( - ARMISD::VMOVN, DL, VT, - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)), - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), - DAG.getConstant(1, DL, MVT::i32)); - if (isVMOVNTruncMask(Mask, VT, true)) - return DAG.getNode( - ARMISD::VMOVN, DL, VT, - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)), - DAG.getConstant(1, DL, MVT::i32)); - } - } - - // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the - // truncate to a buildvector to allow the generic optimisations to kick in. 
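// Illustrative aside, not part of the patch: the buildvector lowering just
// below extracts every lane of the MVETRUNC operands in order and rebuilds the
// narrower vector from them. A standalone scalar model of that lane ordering
// for the two-operand v4i32 -> v8i16 case, with invented values.
#include <cassert>
#include <cstdint>

static void mvetrunc_2x_v4i32(const int32_t A[4], const int32_t B[4],
                              int16_t Out[8]) {
  // Lanes of the first operand come first, then the second, each truncated.
  for (int I = 0; I < 4; ++I) {
    Out[I] = static_cast<int16_t>(A[I]);
    Out[I + 4] = static_cast<int16_t>(B[I]);
  }
}

int main() {
  const int32_t A[4] = {0x10001, 0x20002, 0x30003, 0x40004};
  const int32_t B[4] = {0x50005, 0x60006, 0x70007, 0x80008};
  int16_t Out[8];
  mvetrunc_2x_v4i32(A, B, Out);
  assert(Out[0] == 1 && Out[3] == 4); // truncated lanes of A, in order
  assert(Out[4] == 5 && Out[7] == 8); // then truncated lanes of B
  return 0;
}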
- if (all_of(N->ops(), [](SDValue Op) { - return Op.getOpcode() == ISD::BUILD_VECTOR || - Op.getOpcode() == ISD::VECTOR_SHUFFLE || - (Op.getOpcode() == ISD::BITCAST && - Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR); - })) { - SmallVector<SDValue, 8> Extracts; - for (unsigned Op = 0; Op < N->getNumOperands(); Op++) { - SDValue O = N->getOperand(Op); - for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) { - SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O, - DAG.getConstant(i, DL, MVT::i32)); - Extracts.push_back(Ext); - } - } - return DAG.getBuildVector(VT, DL, Extracts); - } - - // If we are late in the legalization process and nothing has optimised - // the trunc to anything better, lower it to a stack store and reload, - // performing the truncation whilst keeping the lanes in the correct order: - // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack; - if (!DCI.isAfterLegalizeDAG()) - return SDValue(); - - SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4)); - int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); - int NumIns = N->getNumOperands(); - assert((NumIns == 2 || NumIns == 4) && - "Expected 2 or 4 inputs to an MVETrunc"); - EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); - if (N->getNumOperands() == 4) - StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext()); - - SmallVector<SDValue> Chains; - for (int I = 0; I < NumIns; I++) { - SDValue Ptr = DAG.getNode( - ISD::ADD, DL, StackPtr.getValueType(), StackPtr, - DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType())); - MachinePointerInfo MPI = MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), SPFI, I * 16 / NumIns); - SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I), - Ptr, MPI, StoreVT, Align(4)); - Chains.push_back(Ch); - } - - SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - MachinePointerInfo MPI = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0); - return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4)); -} - -// Take a MVEEXT(load x) and split that into (extload x, extload x+8) -static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, - SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode()); - if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed()) - return SDValue(); - - EVT FromVT = LD->getMemoryVT(); - EVT ToVT = N->getValueType(0); - if (!ToVT.isVector()) - return SDValue(); - assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2); - EVT ToEltVT = ToVT.getVectorElementType(); - EVT FromEltVT = FromVT.getVectorElementType(); - - unsigned NumElements = 0; - if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) - NumElements = 4; - if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) - NumElements = 8; - assert(NumElements != 0); - - ISD::LoadExtType NewExtType = - N->getOpcode() == ARMISD::MVESEXT ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; - if (LD->getExtensionType() != ISD::NON_EXTLOAD && - LD->getExtensionType() != ISD::EXTLOAD && - LD->getExtensionType() != NewExtType) - return SDValue(); - - LLVMContext &C = *DAG.getContext(); - SDLoc DL(LD); - // Details about the old load - SDValue Ch = LD->getChain(); - SDValue BasePtr = LD->getBasePtr(); - Align Alignment = LD->getOriginalAlign(); - MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); - AAMDNodes AAInfo = LD->getAAInfo(); - - SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); - EVT NewFromVT = EVT::getVectorVT( - C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); - EVT NewToVT = EVT::getVectorVT( - C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); - - SmallVector<SDValue, 4> Loads; - SmallVector<SDValue, 4> Chains; - for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { - unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; - SDValue NewPtr = - DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); - - SDValue NewLoad = - DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, - LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, - Alignment, MMOFlags, AAInfo); - Loads.push_back(NewLoad); - Chains.push_back(SDValue(NewLoad.getNode(), 1)); - } - - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); - return DAG.getMergeValues(Loads, DL); -} - -// Perform combines for MVEEXT. If it has not be optimized to anything better -// before lowering, it gets converted to stack store and extloads performing the -// extend whilst still keeping the same lane ordering. -SDValue ARMTargetLowering::PerformMVEExtCombine( - SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - SDLoc DL(N); - assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements"); - assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type"); - - EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT( - *DAG.getContext()); - auto Extend = [&](SDValue V) { - SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V); - return N->getOpcode() == ARMISD::MVESEXT - ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT, - DAG.getValueType(ExtVT)) - : DAG.getZeroExtendInReg(VVT, DL, ExtVT); - }; - - // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP) - if (N->getOperand(0).getOpcode() == ARMISD::VDUP) { - SDValue Ext = Extend(N->getOperand(0)); - return DAG.getMergeValues({Ext, Ext}, DL); - } - - // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG - if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) { - ArrayRef<int> Mask = SVN->getMask(); - assert(Mask.size() == 2 * VT.getVectorNumElements()); - assert(Mask.size() == SVN->getValueType(0).getVectorNumElements()); - unsigned Rev = VT == MVT::v4i32 ? 
ARMISD::VREV32 : ARMISD::VREV16; - SDValue Op0 = SVN->getOperand(0); - SDValue Op1 = SVN->getOperand(1); - - auto CheckInregMask = [&](int Start, int Offset) { - for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx) - if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset) - return false; - return true; - }; - SDValue V0 = SDValue(N, 0); - SDValue V1 = SDValue(N, 1); - if (CheckInregMask(0, 0)) - V0 = Extend(Op0); - else if (CheckInregMask(0, 1)) - V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0)); - else if (CheckInregMask(0, Mask.size())) - V0 = Extend(Op1); - else if (CheckInregMask(0, Mask.size() + 1)) - V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1)); - - if (CheckInregMask(VT.getVectorNumElements(), Mask.size())) - V1 = Extend(Op1); - else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1)) - V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1)); - else if (CheckInregMask(VT.getVectorNumElements(), 0)) - V1 = Extend(Op0); - else if (CheckInregMask(VT.getVectorNumElements(), 1)) - V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0)); - - if (V0.getNode() != N || V1.getNode() != N) - return DAG.getMergeValues({V0, V1}, DL); - } - - // MVEEXT(load) -> extload, extload - if (N->getOperand(0)->getOpcode() == ISD::LOAD) - if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG)) - return L; - - if (!DCI.isAfterLegalizeDAG()) - return SDValue(); - - // Lower to a stack store and reload: - // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8; - SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4)); - int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); - int NumOuts = N->getNumValues(); - assert((NumOuts == 2 || NumOuts == 4) && - "Expected 2 or 4 outputs to an MVEEXT"); - EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT( - *DAG.getContext()); - if (N->getNumOperands() == 4) - LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext()); - - MachinePointerInfo MPI = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0); - SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0), - StackPtr, MPI, Align(4)); - - SmallVector<SDValue> Loads; - for (int I = 0; I < NumOuts; I++) { - SDValue Ptr = DAG.getNode( - ISD::ADD, DL, StackPtr.getValueType(), StackPtr, - DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType())); - MachinePointerInfo MPI = MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), SPFI, I * 16 / NumOuts); - SDValue Load = DAG.getExtLoad( - N->getOpcode() == ARMISD::MVESEXT ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, - VT, Chain, Ptr, MPI, LoadVT, Align(4)); - Loads.push_back(Load); - } - - return DAG.getMergeValues(Loads, DL); -} - SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; - case ISD::SELECT_CC: - case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); - case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); - case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget); case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); @@ -18677,57 +14632,31 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); - case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG); + case ARMISD::BFI: return PerformBFICombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); - case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI); - case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG); case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); - case ISD::EXTRACT_VECTOR_ELT: - return PerformExtractEltCombine(N, DCI, Subtarget); - case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG); - case ISD::INSERT_SUBVECTOR: return PerformInsertSubvectorCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); - case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget); - case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget); + case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); + case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); - case ISD::FADD: - return PerformFAddVSelectCombine(N, DCI.DAG, Subtarget); case ISD::FDIV: return PerformVDIVCombine(N, DCI.DAG, Subtarget); - case ISD::INTRINSIC_WO_CHAIN: - return PerformIntrinsicCombine(N, DCI); + case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: return PerformShiftCombine(N, DCI, Subtarget); case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: - case ISD::ANY_EXTEND: - return PerformExtendCombine(N, DCI.DAG, Subtarget); - case ISD::FP_EXTEND: - return PerformFPExtendCombine(N, DCI.DAG, Subtarget); - case ISD::SMIN: - case ISD::UMIN: - case ISD::SMAX: - case ISD::UMAX: - return PerformMinMaxCombine(N, DCI.DAG, Subtarget); - case ARMISD::CMOV: - return PerformCMOVCombine(N, DCI.DAG); - case ARMISD::BRCOND: - return PerformBRCONDCombine(N, DCI.DAG); - case ARMISD::CMPZ: - return PerformCMPZCombine(N, DCI.DAG); - case ARMISD::CSINC: - case ARMISD::CSINV: - case ARMISD::CSNEG: - return PerformCSETCombine(N, DCI.DAG); - case ISD::LOAD: - return PerformLOADCombine(N, DCI, Subtarget); + case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); + case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); + case ISD::LOAD: return PerformLOADCombine(N, DCI); 
case ARMISD::VLD1DUP: case ARMISD::VLD2DUP: case ARMISD::VLD3DUP: @@ -18735,30 +14664,10 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); - case ISD::BITCAST: - return PerformBITCASTCombine(N, DCI, Subtarget); case ARMISD::PREDICATE_CAST: return PerformPREDICATE_CASTCombine(N, DCI); - case ARMISD::VECTOR_REG_CAST: - return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget); - case ARMISD::MVETRUNC: - return PerformMVETruncCombine(N, DCI); - case ARMISD::MVESEXT: - case ARMISD::MVEZEXT: - return PerformMVEExtCombine(N, DCI); case ARMISD::VCMP: - return PerformVCMPCombine(N, DCI.DAG, Subtarget); - case ISD::VECREDUCE_ADD: - return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget); - case ARMISD::VMOVN: - return PerformVMOVNCombine(N, DCI); - case ARMISD::VQMOVNs: - case ARMISD::VQMOVNu: - return PerformVQMOVNCombine(N, DCI); - case ARMISD::ASRL: - case ARMISD::LSRL: - case ARMISD::LSLL: - return PerformLongShiftCombine(N, DCI.DAG); + return PerformVCMPCombine(N, DCI, Subtarget); case ARMISD::SMULWB: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); @@ -18775,9 +14684,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, } case ARMISD::SMLALBB: case ARMISD::QADD16b: - case ARMISD::QSUB16b: - case ARMISD::UQADD16b: - case ARMISD::UQSUB16b: { + case ARMISD::QSUB16b: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || @@ -18814,9 +14721,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, break; } case ARMISD::QADD8b: - case ARMISD::QSUB8b: - case ARMISD::UQADD8b: - case ARMISD::UQSUB8b: { + case ARMISD::QSUB8b: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8); if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || @@ -18851,11 +14756,6 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: return PerformVLDCombine(N, DCI); - case Intrinsic::arm_mve_vld2q: - case Intrinsic::arm_mve_vld4q: - case Intrinsic::arm_mve_vst2q: - case Intrinsic::arm_mve_vst4q: - return PerformMVEVLDCombine(N, DCI); default: break; } break; @@ -18869,9 +14769,9 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, } bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, - Align Alignment, + unsigned Alignment, MachineMemOperand::Flags, - unsigned *Fast) const { + bool *Fast) const { // Depends what it gets converted into if the type is weird. 
if (!VT.isSimple()) return false; @@ -18895,7 +14795,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, // A big-endian target may also explicitly support unaligned accesses if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { if (Fast) - *Fast = 1; + *Fast = true; return true; } } @@ -18904,10 +14804,9 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, return false; // These are for predicates - if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 || - Ty == MVT::v2i1)) { + if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { if (Fast) - *Fast = 1; + *Fast = true; return true; } @@ -18933,30 +14832,37 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 || Ty == MVT::v2f64) { if (Fast) - *Fast = 1; + *Fast = true; return true; } return false; } +static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, + unsigned AlignCheck) { + return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && + (DstAlign == 0 || DstAlign % AlignCheck == 0)); +} EVT ARMTargetLowering::getOptimalMemOpType( - const MemOp &Op, const AttributeList &FuncAttributes) const { + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { // See if we can use NEON instructions for this... - if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() && - !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) { - unsigned Fast; - if (Op.size() >= 16 && - (Op.isAligned(Align(16)) || - (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1), + if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { + bool Fast; + if (Size >= 16 && + (memOpAlign(SrcAlign, DstAlign, 16) || + (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, MachineMemOperand::MONone, &Fast) && Fast))) { return MVT::v2f64; - } else if (Op.size() >= 8 && - (Op.isAligned(Align(8)) || + } else if (Size >= 8 && + (memOpAlign(SrcAlign, DstAlign, 8) || (allowsMisalignedMemoryAccesses( - MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) && + MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) && Fast))) { return MVT::f64; } @@ -19068,119 +14974,45 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I, if (!Subtarget->hasMVEIntegerOps()) return false; - auto IsFMSMul = [&](Instruction *I) { - if (!I->hasOneUse()) - return false; - auto *Sub = cast<Instruction>(*I->users().begin()); - return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; - }; - auto IsFMS = [&](Instruction *I) { - if (match(I->getOperand(0), m_FNeg(m_Value())) || - match(I->getOperand(1), m_FNeg(m_Value()))) - return true; - return false; - }; - - auto IsSinker = [&](Instruction *I, int Operand) { + auto IsSinker = [](Instruction *I, int Operand) { switch (I->getOpcode()) { case Instruction::Add: case Instruction::Mul: - case Instruction::FAdd: case Instruction::ICmp: - case Instruction::FCmp: return true; - case Instruction::FMul: - return !IsFMSMul(I); case Instruction::Sub: - case Instruction::FSub: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: return Operand == 1; - case Instruction::Call: - if (auto *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::fma: - return !IsFMS(I); - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::arm_mve_add_predicated: - case 
Intrinsic::arm_mve_mul_predicated: - case Intrinsic::arm_mve_qadd_predicated: - case Intrinsic::arm_mve_vhadd: - case Intrinsic::arm_mve_hadd_predicated: - case Intrinsic::arm_mve_vqdmull: - case Intrinsic::arm_mve_vqdmull_predicated: - case Intrinsic::arm_mve_vqdmulh: - case Intrinsic::arm_mve_qdmulh_predicated: - case Intrinsic::arm_mve_vqrdmulh: - case Intrinsic::arm_mve_qrdmulh_predicated: - case Intrinsic::arm_mve_fma_predicated: - return true; - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - case Intrinsic::arm_mve_sub_predicated: - case Intrinsic::arm_mve_qsub_predicated: - case Intrinsic::arm_mve_hsub_predicated: - case Intrinsic::arm_mve_vhsub: - return Operand == 1; - default: - return false; - } - } - return false; default: return false; } }; - for (auto OpIdx : enumerate(I->operands())) { - Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get()); - // Make sure we are not already sinking this operand - if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) - continue; - - Instruction *Shuffle = Op; - if (Shuffle->getOpcode() == Instruction::BitCast) - Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0)); - // We are looking for a splat that can be sunk. - if (!Shuffle || - !match(Shuffle, m_Shuffle( - m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), - m_Undef(), m_ZeroMask()))) - continue; - if (!IsSinker(I, OpIdx.index())) - continue; - - // All uses of the shuffle should be sunk to avoid duplicating it across gpr - // and vector registers - for (Use &U : Op->uses()) { - Instruction *Insn = cast<Instruction>(U.getUser()); - if (!IsSinker(Insn, U.getOperandNo())) - return false; - } - - Ops.push_back(&Shuffle->getOperandUse(0)); - if (Shuffle != Op) - Ops.push_back(&Op->getOperandUse(0)); - Ops.push_back(&OpIdx.value()); + int Op = 0; + if (!isa<ShuffleVectorInst>(I->getOperand(Op))) + Op = 1; + if (!IsSinker(I, Op)) + return false; + if (!match(I->getOperand(Op), + m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()), + m_Undef(), m_Zero()))) { + return false; + } + Instruction *Shuffle = cast<Instruction>(I->getOperand(Op)); + // All uses of the shuffle should be sunk to avoid duplicating it across gpr + // and vector registers + for (Use &U : Shuffle->uses()) { + Instruction *Insn = cast<Instruction>(U.getUser()); + if (!IsSinker(Insn, U.getOperandNo())) + return false; } + Ops.push_back(&Shuffle->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(Op)); return true; } -Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const { - if (!Subtarget->hasMVEIntegerOps()) - return nullptr; - Type *SVIType = SVI->getType(); - Type *ScalarType = SVIType->getScalarType(); - - if (ScalarType->isFloatTy()) - return Type::getInt32Ty(SVIType->getContext()); - if (ScalarType->isHalfTy()) - return Type::getInt16Ty(SVIType->getContext()); - return nullptr; -} - bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT VT = ExtVal.getValueType(); @@ -19192,9 +15024,6 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { return false; } - if (Subtarget->hasMVEIntegerOps()) - return true; - // Don't create a loadext if we can fold the extension into a wide/long // instruction. 
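// Illustrative aside, not part of the diff: the removed shouldConvertSplatType hook
// above asks for float/half splats to be rebuilt as same-width integer splats.  A splat
// only replicates a bit pattern, so splatting the i32 image of a float and
// reinterpreting each lane yields the identical vector -- a minimal standalone check.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  float f = 3.14159f;

  uint32_t bits;                                // i32 image of the float
  std::memcpy(&bits, &f, sizeof bits);

  uint32_t lanes[4] = {bits, bits, bits, bits}; // integer splat of the bit pattern

  for (uint32_t lane : lanes) {
    float back;
    std::memcpy(&back, &lane, sizeof back);     // reinterpret each lane as float
    assert(back == f);                          // same value in every lane
  }
  return 0;
}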
// If there's more than one user instruction, the loadext is desirable no @@ -19225,6 +15054,17 @@ bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { return true; } +int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { + if (isLegalAddressingMode(DL, AM, Ty, AS)) { + if (Subtarget->hasFPAO()) + return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster + return 0; + } + return -1; +} + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be /// expanded to FMAs when this method returns true, otherwise fmuladd is @@ -19521,31 +15361,6 @@ bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { return AbsImm >= 0 && AbsImm <= 255; } -// Return false to prevent folding -// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine, -// if the folding leads to worse code. -bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode, - SDValue ConstNode) const { - // Let the DAGCombiner decide for vector types and large types. - const EVT VT = AddNode.getValueType(); - if (VT.isVector() || VT.getScalarSizeInBits() > 32) - return true; - - // It is worse if c0 is legal add immediate, while c1*c0 is not - // and has to be composed by at least two instructions. - const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1)); - const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode); - const int64_t C0 = C0Node->getSExtValue(); - APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue(); - if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue())) - return true; - if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1) - return false; - - // Default to true and let the DAGCombiner decide. - return true; -} - static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, @@ -19630,7 +15445,7 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, return false; } -static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, +static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG) { @@ -19665,16 +15480,16 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, // (in BE/masked) type. 
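// Illustrative aside, not part of the diff: the removed isMulAddWithConstProfitable
// hook further up blocks (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) when c0 is a
// cheap immediate but c0*c1 is not.  Sketch below: the identity itself plus a
// simplified A32 "modified immediate" test (8 bits rotated right by an even amount)
// standing in, as an assumption, for isLegalAddImmediate / ConstantMaterializationCost.
#include <cassert>
#include <cstdint>

static bool isA32ModifiedImm(uint32_t v) {
  for (unsigned rot = 0; rot < 32; rot += 2) {
    uint32_t rotl = (v << rot) | (rot ? (v >> (32 - rot)) : 0);
    if (rotl <= 0xFFu)
      return true;               // v == ROR(imm8, rot) for some 8-bit imm8
  }
  return false;
}

int main() {
  uint32_t r = 12345, c0 = 255, c1 = 100001;
  // The rewrite is always value-preserving...
  assert((r + c0) * c1 == r * c1 + c0 * c1);
  // ...but it can trade a cheap immediate (c0) for an expensive one (c0*c1).
  assert(isA32ModifiedImm(c0));
  assert(!isA32ModifiedImm(c0 * c1));
  return 0;
}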
Base = Ptr->getOperand(0); if (VT == MVT::v4i16) { - if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2)) + if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) return true; } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { if (IsInRange(RHSC, 0x80, 1)) return true; - } else if (Alignment >= 4 && + } else if (Align >= 4 && (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && IsInRange(RHSC, 0x80, 4)) return true; - else if (Alignment >= 2 && + else if (Align >= 2 && (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && IsInRange(RHSC, 0x80, 2)) return true; @@ -19696,28 +15511,28 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, EVT VT; SDValue Ptr; - Align Alignment; + unsigned Align; bool isSEXTLoad = false; bool IsMasked = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); - Alignment = LD->getAlign(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); - Alignment = ST->getAlign(); + Align = ST->getAlignment(); } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); - Alignment = LD->getAlign(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; IsMasked = true; } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); - Alignment = ST->getAlign(); + Align = ST->getAlignment(); IsMasked = true; } else return false; @@ -19726,9 +15541,9 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, bool isLegal = false; if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && - getMVEIndexedAddressParts( - Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked, - Subtarget->isLittle(), Base, Offset, isInc, DAG); + getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, + IsMasked, Subtarget->isLittle(), Base, + Offset, isInc, DAG); else { if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, @@ -19754,31 +15569,31 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; - Align Alignment; + unsigned Align; bool isSEXTLoad = false, isNonExt; bool IsMasked = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); - Alignment = LD->getAlign(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); - Alignment = ST->getAlign(); + Align = ST->getAlignment(); isNonExt = !ST->isTruncatingStore(); } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); - Alignment = LD->getAlign(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; IsMasked = true; } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); - Alignment = ST->getAlign(); + Align = ST->getAlignment(); isNonExt = !ST->isTruncatingStore(); IsMasked = true; } else @@ -19793,8 +15608,6 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 
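// Illustrative aside, not part of the diff: the IsInRange(RHSC, 0x80, N) calls above
// gate pre/post-indexed MVE loads/stores on the constant offset.  Hedged standalone
// model, assuming the helper (not shown in this hunk) requires the offset to be a whole
// number of elements with the scaled offset inside roughly a signed 7-bit window.
#include <cassert>
#include <cstdint>

static bool mveOffsetFits(int64_t offset, int64_t limit, int scale) {
  if (offset % scale != 0)
    return false;                             // must be a whole number of elements
  int64_t scaled = offset / scale;
  return scaled > -limit && scaled < limit;   // assumed ~7-bit signed window
}

int main() {
  // 32-bit elements (scale 4, limit 0x80): about +/-508 in steps of 4.
  assert(mveOffsetFits(508, 0x80, 4));
  assert(!mveOffsetFits(512, 0x80, 4));       // one element too far
  assert(!mveOffsetFits(6, 0x80, 4));         // not a multiple of the element size
  // 16-bit elements (scale 2): the same window, scaled by 2.
  assert(mveOffsetFits(-254, 0x80, 2));
  return 0;
}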
if (!RHS || RHS->getZExtValue() != 4) return false; - if (Alignment < Align(4)) - return false; Offset = Op->getOperand(1); Base = Op->getOperand(0); @@ -19806,7 +15619,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isLegal = false; if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && - getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked, + getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked, Subtarget->isLittle(), Base, Offset, isInc, DAG); else { @@ -19868,7 +15681,8 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, return; KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1); - Known = KnownBits::commonBits(Known, KnownRHS); + Known.Zero &= KnownRHS.Zero; + Known.One &= KnownRHS.One; return; } case ISD::INTRINSIC_W_CHAIN: { @@ -19920,45 +15734,18 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, if (Op.getOpcode() == ARMISD::VGETLANEs) Known = Known.sext(DstSz); else { - Known = Known.zext(DstSz); + Known = Known.zext(DstSz, true /* extended bits are known zero */); } assert(DstSz == Known.getBitWidth()); break; } - case ARMISD::VMOVrh: { - KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); - assert(KnownOp.getBitWidth() == 16); - Known = KnownOp.zext(32); - break; - } - case ARMISD::CSINC: - case ARMISD::CSINV: - case ARMISD::CSNEG: { - KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); - KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); - - // The result is either: - // CSINC: KnownOp0 or KnownOp1 + 1 - // CSINV: KnownOp0 or ~KnownOp1 - // CSNEG: KnownOp0 or KnownOp1 * -1 - if (Op.getOpcode() == ARMISD::CSINC) - KnownOp1 = KnownBits::computeForAddSub( - true, false, KnownOp1, KnownBits::makeConstant(APInt(32, 1))); - else if (Op.getOpcode() == ARMISD::CSINV) - std::swap(KnownOp1.Zero, KnownOp1.One); - else if (Op.getOpcode() == ARMISD::CSNEG) - KnownOp1 = KnownBits::mul( - KnownOp1, KnownBits::makeConstant(APInt(32, -1))); - - Known = KnownBits::commonBits(KnownOp0, KnownOp1); - break; - } } } -bool ARMTargetLowering::targetShrinkDemandedConstant( - SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, - TargetLoweringOpt &TLO) const { +bool +ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op, + const APInt &DemandedAPInt, + TargetLoweringOpt &TLO) const { // Delay optimization, so we don't have to deal with illegal types, or block // optimizations. if (!TLO.LegalOps) @@ -19983,7 +15770,7 @@ bool ARMTargetLowering::targetShrinkDemandedConstant( unsigned Mask = C->getZExtValue(); - unsigned Demanded = DemandedBits.getZExtValue(); + unsigned Demanded = DemandedAPInt.getZExtValue(); unsigned ShrunkMask = Mask & Demanded; unsigned ExpandedMask = Mask | ~Demanded; @@ -20038,43 +15825,6 @@ bool ARMTargetLowering::targetShrinkDemandedConstant( return false; } -bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode( - SDValue Op, const APInt &OriginalDemandedBits, - const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, - unsigned Depth) const { - unsigned Opc = Op.getOpcode(); - - switch (Opc) { - case ARMISD::ASRL: - case ARMISD::LSRL: { - // If this is result 0 and the other result is unused, see if the demand - // bits allow us to shrink this long shift into a standard small shift in - // the opposite direction. 
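// Illustrative aside, not part of the diff: the ShrunkMask / ExpandedMask pair above
// bounds which replacement AND constants are sound.  Any NewMask with
// ShrunkMask subset-of NewMask subset-of ExpandedMask agrees with Mask on every
// demanded bit, so 'x & Mask' can be rewritten with a cheaper constant (for example a
// mask like 0xFF that maps onto UXTB).  Brute-force check of that claim on a sample:
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Mask     = 0x00FF00FFu;  // original AND constant, not a single cheap imm
  uint32_t Demanded = 0x000000FFu;  // only these result bits are ever used
  uint32_t Shrunk   = Mask & Demanded;
  uint32_t Expanded = Mask | ~Demanded;

  uint32_t NewMask = 0xFFu;         // candidate cheaper constant
  assert((Shrunk & ~NewMask) == 0 && (NewMask & ~Expanded) == 0); // within bounds

  for (uint32_t x = 0; x < (1u << 16); ++x)   // sample of inputs
    assert(((x & Mask) & Demanded) == ((x & NewMask) & Demanded));
  return 0;
}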
- if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) && - isa<ConstantSDNode>(Op->getOperand(2))) { - unsigned ShAmt = Op->getConstantOperandVal(2); - if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32) - << (32 - ShAmt))) - return TLO.CombineTo( - Op, TLO.DAG.getNode( - ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1), - TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32))); - } - break; - } - case ARMISD::VBICIMM: { - SDValue Op0 = Op.getOperand(0); - unsigned ModImm = Op.getConstantOperandVal(1); - unsigned EltBits = 0; - uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits); - if ((OriginalDemandedBits & Mask) == 0) - return TLO.CombineTo(Op, Op0); - } - } - - return TargetLowering::SimplifyDemandedBitsForTargetNode( - Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); -} //===----------------------------------------------------------------------===// // ARM Inline Assembly Support @@ -20085,7 +15835,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { if (!Subtarget->hasV6Ops()) return false; - InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); + InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); std::string AsmStr = IA->getAsmString(); SmallVector<StringRef, 4> AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); @@ -20093,7 +15843,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { switch (AsmPieces.size()) { default: return false; case 1: - AsmStr = std::string(AsmPieces[0]); + AsmStr = AsmPieces[0]; AsmPieces.clear(); SplitString(AsmStr, AsmPieces, " \t,"); @@ -20217,8 +15967,6 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint( case 'w': if (VT == MVT::Other) break; - if (VT == MVT::f16 || VT == MVT::bf16) - return RCPair(0U, &ARM::HPRRegClass); if (VT == MVT::f32) return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) @@ -20239,8 +15987,6 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint( case 't': if (VT == MVT::Other) break; - if (VT == MVT::f16 || VT == MVT::bf16) - return RCPair(0U, &ARM::HPRRegClass); if (VT == MVT::f32 || VT == MVT::i32) return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) @@ -20268,7 +16014,7 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint( break; } - if (StringRef("{cc}").equals_insensitive(Constraint)) + if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); @@ -20492,21 +16238,8 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { "Invalid opcode for Div/Rem lowering"); bool isSigned = (Opcode == ISD::SDIVREM); EVT VT = Op->getValueType(0); - SDLoc dl(Op); - - if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) { - SmallVector<SDValue> Result; - if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) { - SDValue Res0 = - DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]); - SDValue Res1 = - DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]); - return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), - {Res0, Res1}); - } - } - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + SDLoc dl(Op); // If the target has hardware divide, use divide + multiply + subtract: // div = a / b @@ -20556,20 +16289,11 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { // Lowers REM using divmod helpers // see RTABI section 4.2/4.3 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { - EVT VT = 
N->getValueType(0); - - if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) { - SmallVector<SDValue> Result; - if (expandDIVREMByConstant(N, Result, MVT::i32, DAG)) - return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0), - Result[0], Result[1]); - } - // Build return types (div and rem) std::vector<Type*> RetTyParams; Type *RetTyElement; - switch (VT.getSimpleVT().SimpleTy) { + switch (N->getValueType(0).getSimpleVT().SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; @@ -20618,15 +16342,13 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const if (DAG.getMachineFunction().getFunction().hasFnAttribute( "no-stack-arg-probe")) { - MaybeAlign Align = - cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); + unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); if (Align) - SP = - DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32)); + SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align, DL, MVT::i32)); Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); SDValue Ops[2] = { SP, Chain }; return DAG.getMergeValues(Ops, DL); @@ -20741,6 +16463,38 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result; } +void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); + MVT HalfT = MVT::i32; + SDLoc dl(N); + SDValue Hi, Lo, Tmp; + + if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || + !isOperationLegalOrCustom(ISD::UADDO, HalfT)) + return ; + + unsigned OpTypeBits = HalfT.getScalarSizeInBits(); + SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); + + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(0, dl, HalfT)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(1, dl, HalfT)); + + Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, + DAG.getConstant(OpTypeBits - 1, dl, + getShiftAmountTy(HalfT, DAG.getDataLayout()))); + Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); + Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, + SDValue(Lo.getNode(), 1)); + Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); + Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); + + Results.push_back(Lo); + Results.push_back(Hi); +} + bool ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The ARM target isn't yet aware of offsets. 
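// Illustrative aside, not part of the diff: the lowerABS hunk above expands a 64-bit
// abs into 32-bit pieces: broadcast the sign into a word with an arithmetic shift, add
// it through the UADDO/ADDCARRY chain, then XOR both halves with it.  Standalone model:
#include <cassert>
#include <cstdint>

static uint64_t abs64_via_words(uint32_t lo, uint32_t hi) {
  uint32_t sign = (uint32_t)((int32_t)hi >> 31);   // 0 or 0xFFFFFFFF (SRA by 31)
  uint64_t sum  = (uint64_t)lo + sign;             // UADDO: low add, capture carry
  uint32_t newLo = (uint32_t)sum;
  uint32_t carry = (uint32_t)(sum >> 32);
  uint32_t newHi = hi + sign + carry;              // ADDCARRY on the high half
  newLo ^= sign;                                   // conditional negate of both halves
  newHi ^= sign;
  return ((uint64_t)newHi << 32) | newLo;
}

int main() {
  int64_t samples[] = {0, 1, -1, 123456789012345LL, -123456789012345LL, INT64_MIN + 1};
  for (int64_t v : samples) {
    uint32_t lo = (uint32_t)(uint64_t)v;
    uint32_t hi = (uint32_t)((uint64_t)v >> 32);
    assert(abs64_via_words(lo, hi) == (uint64_t)(v < 0 ? -v : v));
  }
  return 0;
}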
@@ -20765,9 +16519,6 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, return false; if (VT == MVT::f16 && Subtarget->hasFullFP16()) return ARM_AM::getFP16Imm(Imm) != -1; - if (VT == MVT::f32 && Subtarget->hasFullFP16() && - ARM_AM::getFP32FP16Imm(Imm) != -1) - return true; if (VT == MVT::f32) return ARM_AM::getFP32Imm(Imm) != -1; if (VT == MVT::f64 && Subtarget->hasFP64()) @@ -20800,8 +16551,8 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Value *AlignArg = I.getArgOperand(I.arg_size() - 1); - Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -20814,7 +16565,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(I.arg_size() - 1); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align.reset(); // volatile loads with NEON intrinsics not supported @@ -20832,7 +16583,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // Conservatively set memVT to the entire set of vectors stored. auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); unsigned NumElts = 0; - for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) { + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; @@ -20841,8 +16592,8 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Value *AlignArg = I.getArgOperand(I.arg_size() - 1); - Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -20854,7 +16605,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // Conservatively set memVT to the entire set of vectors stored. auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); unsigned NumElts = 0; - for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) { + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; @@ -20868,115 +16619,27 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore; return true; } - case Intrinsic::arm_mve_vld2q: - case Intrinsic::arm_mve_vld4q: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - // Conservatively set memVT to the entire set of vectors loaded. - Type *VecTy = cast<StructType>(I.getType())->getElementType(1); - unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 
2 : 4; - Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = Align(VecTy->getScalarSizeInBits() / 8); - // volatile loads with MVE intrinsics not supported - Info.flags = MachineMemOperand::MOLoad; - return true; - } - case Intrinsic::arm_mve_vst2q: - case Intrinsic::arm_mve_vst4q: { - Info.opc = ISD::INTRINSIC_VOID; - // Conservatively set memVT to the entire set of vectors stored. - Type *VecTy = I.getArgOperand(1)->getType(); - unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4; - Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = Align(VecTy->getScalarSizeInBits() / 8); - // volatile stores with MVE intrinsics not supported - Info.flags = MachineMemOperand::MOStore; - return true; - } - case Intrinsic::arm_mve_vldr_gather_base: - case Intrinsic::arm_mve_vldr_gather_base_predicated: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = nullptr; - Info.memVT = MVT::getVT(I.getType()); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOLoad; - return true; - } - case Intrinsic::arm_mve_vldr_gather_base_wb: - case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = nullptr; - Info.memVT = MVT::getVT(I.getType()->getContainedType(0)); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOLoad; - return true; - } - case Intrinsic::arm_mve_vldr_gather_offset: - case Intrinsic::arm_mve_vldr_gather_offset_predicated: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = nullptr; - MVT DataVT = MVT::getVT(I.getType()); - unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue(); - Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize), - DataVT.getVectorNumElements()); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOLoad; - return true; - } - case Intrinsic::arm_mve_vstr_scatter_base: - case Intrinsic::arm_mve_vstr_scatter_base_predicated: { - Info.opc = ISD::INTRINSIC_VOID; - Info.ptrVal = nullptr; - Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOStore; - return true; - } - case Intrinsic::arm_mve_vstr_scatter_base_wb: - case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = nullptr; - Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOStore; - return true; - } - case Intrinsic::arm_mve_vstr_scatter_offset: - case Intrinsic::arm_mve_vstr_scatter_offset_predicated: { - Info.opc = ISD::INTRINSIC_VOID; - Info.ptrVal = nullptr; - MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType()); - unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); - Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize), - DataVT.getVectorNumElements()); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOStore; - return true; - } case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); - Type *ValTy = I.getParamElementType(0); + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(ValTy); + Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlign(ValTy); + Info.align = 
MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } case Intrinsic::arm_stlex: case Intrinsic::arm_strex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); - Type *ValTy = I.getParamElementType(1); + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(ValTy); + Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlign(ValTy); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -21027,7 +16690,7 @@ bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, return (Index == 0 || Index == ResVT.getVectorNumElements()); } -Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, +Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -21057,7 +16720,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, } // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html -Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, +Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { switch (Ord) { @@ -21070,7 +16733,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, case AtomicOrdering::SequentiallyConsistent: if (!Inst->hasAtomicStore()) return nullptr; // Nothing to do - [[fallthrough]]; + LLVM_FALLTHROUGH; case AtomicOrdering::Release: case AtomicOrdering::AcquireRelease: if (Subtarget->preferISHSTBarriers()) @@ -21082,7 +16745,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, llvm_unreachable("Unknown fence ordering in emitLeadingFence"); } -Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, +Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { switch (Ord) { @@ -21104,19 +16767,9 @@ Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit // anything for those. -TargetLoweringBase::AtomicExpansionKind -ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { - bool has64BitAtomicStore; - if (Subtarget->isMClass()) - has64BitAtomicStore = false; - else if (Subtarget->isThumb()) - has64BitAtomicStore = Subtarget->hasV7Ops(); - else - has64BitAtomicStore = Subtarget->hasV6Ops(); - +bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); - return Size == 64 && has64BitAtomicStore ? 
AtomicExpansionKind::Expand - : AtomicExpansionKind::None; + return (Size == 64) && !Subtarget->isMClass(); } // Loads and stores less than 64-bits are already atomic; ones above that @@ -21128,17 +16781,9 @@ ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // sections A8.8.72-74 LDRD) TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { - bool has64BitAtomicLoad; - if (Subtarget->isMClass()) - has64BitAtomicLoad = false; - else if (Subtarget->isThumb()) - has64BitAtomicLoad = Subtarget->hasV7Ops(); - else - has64BitAtomicLoad = Subtarget->hasV6Ops(); - unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly - : AtomicExpansionKind::None; + return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly + : AtomicExpansionKind::None; } // For the real atomic operations, we have ldrex/strex up to 32 bits, @@ -21149,28 +16794,12 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { return AtomicExpansionKind::CmpXChg; unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - bool hasAtomicRMW; - if (Subtarget->isMClass()) - hasAtomicRMW = Subtarget->hasV8MBaselineOps(); - else if (Subtarget->isThumb()) - hasAtomicRMW = Subtarget->hasV7Ops(); - else - hasAtomicRMW = Subtarget->hasV6Ops(); - if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) { - // At -O0, fast-regalloc cannot cope with the live vregs necessary to - // implement atomicrmw without spilling. If the target address is also on - // the stack and close enough to the spill slot, this can lead to a - // situation where the monitor always gets cleared and the atomic operation - // can never succeed. So at -O0 lower this operation to a CAS loop. - if (getTargetMachine().getOptLevel() == CodeGenOpt::None) - return AtomicExpansionKind::CmpXChg; - return AtomicExpansionKind::LLSC; - } - return AtomicExpansionKind::None; + bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); + return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) + ? AtomicExpansionKind::LLSC + : AtomicExpansionKind::None; } -// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32 -// bits, and up to 64 bits on the non-M profiles. TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { // At -O0, fast-regalloc cannot cope with the live vregs necessary to @@ -21178,16 +16807,9 @@ ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { // on the stack and close enough to the spill slot, this can lead to a // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. - unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits(); - bool HasAtomicCmpXchg; - if (Subtarget->isMClass()) - HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps(); - else if (Subtarget->isThumb()) - HasAtomicCmpXchg = Subtarget->hasV7Ops(); - else - HasAtomicCmpXchg = Subtarget->hasV6Ops(); - if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg && - Size <= (Subtarget->isMClass() ? 
32U : 64U)) + bool HasAtomicCmpXchg = + !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); + if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg) return AtomicExpansionKind::LLSC; return AtomicExpansionKind::None; } @@ -21197,9 +16819,9 @@ bool ARMTargetLowering::shouldInsertFencesForAtomic( return InsertFencesForAtomic; } +// This has so far only been implemented for MachO. bool ARMTargetLowering::useLoadStackGuardNode() const { - // ROPI/RWPI are not supported currently. - return !Subtarget->isROPI() && !Subtarget->isRWPI(); + return Subtarget->isTargetMachO(); } void ARMTargetLowering::insertSSPDeclarations(Module &M) const { @@ -21215,7 +16837,7 @@ void ARMTargetLowering::insertSSPDeclarations(Module &M) const { "__security_check_cookie", Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext())); if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) - F->addParamAttr(0, Attribute::AttrKind::InReg); + F->addAttribute(1, Attribute::AttrKind::InReg); } Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const { @@ -21251,7 +16873,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); - unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue(); + unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth(); // We can do a store + vector extract on any vector that fits perfectly in a D // or Q register. if (BitWidth == 64 || BitWidth == 128) { @@ -21261,48 +16883,28 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; } -bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { +bool ARMTargetLowering::isCheapToSpeculateCttz() const { return Subtarget->hasV6T2Ops(); } -bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { +bool ARMTargetLowering::isCheapToSpeculateCtlz() const { return Subtarget->hasV6T2Ops(); } -bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial( - const Instruction &AndI) const { - if (!Subtarget->hasV7Ops()) - return false; - - // Sink the `and` instruction only if the mask would fit into a modified - // immediate operand. - ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1)); - if (!Mask || Mask->getValue().getBitWidth() > 32u) - return false; - auto MaskVal = unsigned(Mask->getValue().getZExtValue()); - return (Subtarget->isThumb2() ? 
ARM_AM::getT2SOImmVal(MaskVal) - : ARM_AM::getSOImmVal(MaskVal)) != -1; +bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { + return !Subtarget->hasMinSize() || Subtarget->isTargetWindows(); } -TargetLowering::ShiftLegalizationStrategy -ARMTargetLowering::preferredShiftLegalizationStrategy( - SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const { - if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows()) - return ShiftLegalizationStrategy::LowerToLibcall; - return TargetLowering::preferredShiftLegalizationStrategy(DAG, N, - ExpansionFactor); -} - -Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, - Value *Addr, +Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); bool IsAcquire = isAcquireOrStronger(Ord); // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i32, i32} and we have to recombine them into a // single i64 here. - if (ValueTy->getPrimitiveSizeInBits() == 64) { + if (ValTy->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; Function *Ldrex = Intrinsic::getDeclaration(M, Int); @@ -21314,32 +16916,31 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); if (!Subtarget->isLittle()) std::swap (Lo, Hi); - Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); - Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); + Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); + Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); return Builder.CreateOr( - Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64"); + Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); } Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); - CallInst *CI = Builder.CreateCall(Ldrex, Addr); - CI->addParamAttr( - 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy)); - return Builder.CreateTruncOrBitCast(CI, ValueTy); + return Builder.CreateTruncOrBitCast( + Builder.CreateCall(Ldrex, Addr), + cast<PointerType>(Addr->getType())->getElementType()); } void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( - IRBuilderBase &Builder) const { + IRBuilder<> &Builder) const { if (!Subtarget->hasV7Ops()) return; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); } -Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, - Value *Val, Value *Addr, +Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); bool IsRelease = isReleaseOrStronger(Ord); @@ -21365,13 +16966,10 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, Type *Tys[] = { Addr->getType() }; Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); - CallInst *CI = Builder.CreateCall( + return Builder.CreateCall( Strex, {Builder.CreateZExtOrBitCast( Val, Strex->getFunctionType()->getParamType(0)), Addr}); - CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType, - Val->getType())); - return CI; } @@ -21388,8 +16986,7 @@ ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, } bool ARMTargetLowering::isLegalInterleavedAccessType( - unsigned Factor, FixedVectorType *VecTy, Align Alignment, - const DataLayout &DL) const { + unsigned Factor, VectorType *VecTy, const DataLayout &DL) const { unsigned VecSize = DL.getTypeSizeInBits(VecTy); unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); @@ -21412,9 +17009,6 @@ bool ARMTargetLowering::isLegalInterleavedAccessType( // Ensure the element type is legal. if (ElSize != 8 && ElSize != 16 && ElSize != 32) return false; - // And the alignment if high enough under MVE. - if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8) - return false; // Ensure the total vector size is 64 or a multiple of 128. Types larger than // 128 will be split into multiple interleaved accesses. @@ -21451,16 +17045,15 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); - Type *EltTy = VecTy->getElementType(); + VectorType *VecTy = Shuffles[0]->getType(); + Type *EltTy = VecTy->getVectorElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); - Align Alignment = LI->getAlign(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL)) + if (!isLegalInterleavedAccessType(Factor, VecTy, DL)) return false; unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); @@ -21468,7 +17061,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. 
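// Illustrative aside, not part of the diff: the emitLoadLinked / emitStoreConditional
// hunks a little further up rebuild an i64 from the {i32, i32} pair that ldrexd/ldaexd
// returns (swapping the halves on big-endian) and split it back up for strexd.  Minimal
// model of that split/recombine round trip:
#include <cassert>
#include <cstdint>
#include <utility>

static uint64_t combine(uint32_t lo, uint32_t hi, bool bigEndian) {
  if (bigEndian)                  // the pair comes back in the opposite order
    std::swap(lo, hi);
  return (uint64_t)lo | ((uint64_t)hi << 32);
}

static void split(uint64_t v, uint32_t &lo, uint32_t &hi) {
  lo = (uint32_t)v;               // bits 31..0
  hi = (uint32_t)(v >> 32);       // bits 63..32
}

int main() {
  uint64_t v = 0x1122334455667788ULL;
  uint32_t lo, hi;
  split(v, lo, hi);
  assert(combine(lo, hi, /*bigEndian=*/false) == v);
  assert(combine(hi, lo, /*bigEndian=*/true) == v);  // swapped pair on the BE path
  return 0;
}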
if (EltTy->isPointerTy()) - VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy); + VecTy = + VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); IRBuilder<> Builder(LI); @@ -21478,15 +17072,15 @@ bool ARMTargetLowering::lowerInterleavedLoad( if (NumLoads > 1) { // If we're going to generate more than one load, reset the sub-vector type // to something legal. - VecTy = FixedVectorType::get(VecTy->getElementType(), - VecTy->getNumElements() / NumLoads); + VecTy = VectorType::get(VecTy->getVectorElementType(), + VecTy->getVectorNumElements() / NumLoads); // We will compute the pointer operand of each load from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( - BaseAddr, - VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); + BaseAddr, VecTy->getVectorElementType()->getPointerTo( + LI->getPointerAddressSpace())); } assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); @@ -21503,7 +17097,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( SmallVector<Value *, 2> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); - Ops.push_back(Builder.getInt32(LI->getAlign().value())); + Ops.push_back(Builder.getInt32(LI->getAlignment())); return Builder.CreateCall(VldnFunc, Ops, "vldN"); } else { @@ -21511,8 +17105,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( "expected interleave factor of 2 or 4 for MVE"); Intrinsic::ID LoadInts = Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q; - Type *VecEltTy = - VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()); + Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo( + LI->getPointerAddressSpace()); Type *Tys[] = {VecTy, VecEltTy}; Function *VldnFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); @@ -21532,8 +17126,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) - BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr, - VecTy->getNumElements() * Factor); + BaseAddr = + Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, + VecTy->getVectorNumElements() * Factor); CallInst *VldN = createLoadIntrinsic(BaseAddr); @@ -21548,8 +17143,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( // Convert the integer vector to pointer vector if the element is pointer. 
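// Illustrative aside, not part of the diff: lowerInterleavedLoad above maps a wide load
// plus deinterleaving shuffles onto vld2/vld4 (or MVE vld2q/vld4q), which peel
// interleaved data apart lane by lane.  Scalar model of the factor-2 case that the
// intrinsic performs in a single operation:
#include <cassert>
#include <cstdint>

int main() {
  // Memory holds interleaved pairs: x0 y0 x1 y1 x2 y2 x3 y3.
  uint32_t mem[8] = {10, 20, 11, 21, 12, 22, 13, 23};

  uint32_t x[4], y[4];              // the two results of a factor-2 "vld2"
  for (unsigned lane = 0; lane < 4; ++lane) {
    x[lane] = mem[2 * lane + 0];    // even elements -> first result vector
    y[lane] = mem[2 * lane + 1];    // odd elements  -> second result vector
  }

  for (unsigned lane = 0; lane < 4; ++lane) {
    assert(x[lane] == 10u + lane);
    assert(y[lane] == 20u + lane);
  }
  return 0;
}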
if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr( - SubVec, - FixedVectorType::get(SV->getType()->getElementType(), VecTy)); + SubVec, VectorType::get(SV->getType()->getVectorElementType(), + VecTy->getVectorNumElements())); SubVecs[SV].push_back(SubVec); } @@ -21601,20 +17196,20 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - auto *VecTy = cast<FixedVectorType>(SVI->getType()); - assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); + VectorType *VecTy = SVI->getType(); + assert(VecTy->getVectorNumElements() % Factor == 0 && + "Invalid interleaved store"); - unsigned LaneLen = VecTy->getNumElements() / Factor; - Type *EltTy = VecTy->getElementType(); - auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); + unsigned LaneLen = VecTy->getVectorNumElements() / Factor; + Type *EltTy = VecTy->getVectorElementType(); + VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); - Align Alignment = SI->getAlign(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL)) + if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL)) return false; unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); @@ -21629,12 +17224,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *IntTy = DL.getIntPtrType(EltTy); // Convert to the corresponding integer vector. - auto *IntVecTy = - FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType())); + Type *IntVecTy = + VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); - SubVecTy = FixedVectorType::get(IntTy, LaneLen); + SubVecTy = VectorType::get(IntTy, LaneLen); } // The base address of the store. @@ -21644,14 +17239,14 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // If we're going to generate more than one store, reset the lane length // and sub-vector type to something legal. LaneLen /= NumStores; - SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); + SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); // We will compute the pointer operand of each store from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( - BaseAddr, - SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); + BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( + SI->getPointerAddressSpace())); } assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!"); @@ -21672,15 +17267,16 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SmallVector<Value *, 6> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); - append_range(Ops, Shuffles); - Ops.push_back(Builder.getInt32(SI->getAlign().value())); + for (auto S : Shuffles) + Ops.push_back(S); + Ops.push_back(Builder.getInt32(SI->getAlignment())); Builder.CreateCall(VstNFunc, Ops); } else { assert((Factor == 2 || Factor == 4) && "expected interleave factor of 2 or 4 for MVE"); Intrinsic::ID StoreInts = Factor == 2 ? 
Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q; - Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo( + Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo( SI->getPointerAddressSpace()); Type *Tys[] = {EltPtrTy, SubVecTy}; Function *VstNFunc = @@ -21688,7 +17284,8 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SmallVector<Value *, 6> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy)); - append_range(Ops, Shuffles); + for (auto S : Shuffles) + Ops.push_back(S); for (unsigned F = 0; F < Factor; F++) { Ops.push_back(Builder.getInt32(F)); Builder.CreateCall(VstNFunc, Ops); @@ -21701,7 +17298,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // If we generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) - BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(), + BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), BaseAddr, LaneLen * Factor); SmallVector<Value *, 4> Shuffles; @@ -21711,7 +17308,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { Shuffles.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0))); + Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); } else { unsigned StartMask = 0; for (unsigned j = 1; j < LaneLen; j++) { @@ -21728,7 +17325,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask Shuffles.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(StartMask, LaneLen, 0))); + Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } } @@ -21776,11 +17373,11 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, case HA_DOUBLE: return false; case HA_VECT64: - return VT->getPrimitiveSizeInBits().getFixedValue() == 64; + return VT->getBitWidth() == 64; case HA_VECT128: - return VT->getPrimitiveSizeInBits().getFixedValue() == 128; + return VT->getBitWidth() == 128; case HA_UNKNOWN: - switch (VT->getPrimitiveSizeInBits().getFixedValue()) { + switch (VT->getBitWidth()) { case 64: Base = HA_VECT64; return true; @@ -21797,9 +17394,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, } /// Return the correct alignment for the current calling convention. -Align ARMTargetLowering::getABIAlignmentForCallingConv( - Type *ArgTy, const DataLayout &DL) const { - const Align ABITypeAlign = DL.getABITypeAlign(ArgTy); +Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, + DataLayout DL) const { + const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy)); if (!ArgTy->isVectorTy()) return ABITypeAlign; @@ -21812,8 +17409,7 @@ Align ARMTargetLowering::getABIAlignmentForCallingConv( /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when /// passing according to AAPCS rules. 
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( - Type *Ty, CallingConv::ID CallConv, bool isVarArg, - const DataLayout &DL) const { + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { if (getEffectiveCallingConv(CallConv, isVarArg) != CallingConv::ARM_AAPCS_VFP) return false; @@ -21827,18 +17423,18 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( return IsHA || IsIntArray; } -Register ARMTargetLowering::getExceptionPointerRegister( +unsigned ARMTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. - return Subtarget->useSjLjEH() ? Register() : ARM::R0; + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; } -Register ARMTargetLowering::getExceptionSelectorRegister( +unsigned ARMTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. - return Subtarget->useSjLjEH() ? Register() : ARM::R1; + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; } void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { @@ -21892,105 +17488,3 @@ void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const { MF.getFrameInfo().computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } - -bool ARMTargetLowering::isComplexDeinterleavingSupported() const { - return Subtarget->hasMVEIntegerOps(); -} - -bool ARMTargetLowering::isComplexDeinterleavingOperationSupported( - ComplexDeinterleavingOperation Operation, Type *Ty) const { - auto *VTy = dyn_cast<FixedVectorType>(Ty); - if (!VTy) - return false; - - auto *ScalarTy = VTy->getScalarType(); - unsigned NumElements = VTy->getNumElements(); - - unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements; - if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth)) - return false; - - // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32 - if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy()) - return Subtarget->hasMVEFloatOps(); - - if (Operation != ComplexDeinterleavingOperation::CAdd) - return false; - - return Subtarget->hasMVEIntegerOps() && - (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) || - ScalarTy->isIntegerTy(32)); -} - -Value *ARMTargetLowering::createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, - ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, - Value *Accumulator) const { - - FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType()); - - IRBuilder<> B(I); - - unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements(); - - assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits"); - - if (TyWidth > 128) { - int Stride = Ty->getNumElements() / 2; - auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements()); - auto SplitSeqVec = llvm::to_vector(SplitSeq); - ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride); - ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride); - - auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask); - auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask); - auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask); - auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask); - Value *LowerSplitAcc = nullptr; - Value *UpperSplitAcc = nullptr; - - if (Accumulator) { - LowerSplitAcc = 
B.CreateShuffleVector(Accumulator, LowerSplitMask); - UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask); - } - - auto *LowerSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); - auto *UpperSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); - - ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements()); - return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask); - } - - auto *IntTy = Type::getInt32Ty(B.getContext()); - - ConstantInt *ConstRotation = nullptr; - if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { - ConstRotation = ConstantInt::get(IntTy, (int)Rotation); - - if (Accumulator) - return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty, - {ConstRotation, Accumulator, InputB, InputA}); - return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty, - {ConstRotation, InputB, InputA}); - } - - if (OperationType == ComplexDeinterleavingOperation::CAdd) { - // 1 means the value is not halved. - auto *ConstHalving = ConstantInt::get(IntTy, 1); - - if (Rotation == ComplexDeinterleavingRotation::Rotation_90) - ConstRotation = ConstantInt::get(IntTy, 0); - else if (Rotation == ComplexDeinterleavingRotation::Rotation_270) - ConstRotation = ConstantInt::get(IntTy, 1); - - if (!ConstRotation) - return nullptr; // Invalid rotation for arm_mve_vcaddq - - return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty, - {ConstHalving, ConstRotation, InputA, InputB}); - } - - return nullptr; -} |
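// Illustrative aside, not part of the diff: the CAdd path above selects MVE VCADD with
// a rotation of 90 or 270 degrees, i.e. it adds i*b or -i*b to a on interleaved
// (re, im) lanes.  Scalar check of those two identities, with the lane arithmetic
// written out the way the rotation is assumed to apply:
#include <cassert>
#include <complex>

int main() {
  std::complex<float> a(1.5f, -2.0f), b(0.25f, 4.0f);
  const std::complex<float> i(0.0f, 1.0f);

  // Rotation 90:  re = a.re - b.im, im = a.im + b.re  (a + i*b)
  std::complex<float> rot90(a.real() - b.imag(), a.imag() + b.real());
  assert(rot90 == a + i * b);

  // Rotation 270: re = a.re + b.im, im = a.im - b.re  (a - i*b)
  std::complex<float> rot270(a.real() + b.imag(), a.imag() - b.real());
  assert(rot270 == a - i * b);
  return 0;
}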