-rw-r--r--  gnu/llvm/llvm/lib/Target/ARM/ARMISelLowering.cpp | 7518
1 file changed, 1506 insertions, 6012 deletions
diff --git a/gnu/llvm/llvm/lib/Target/ARM/ARMISelLowering.cpp b/gnu/llvm/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 2e78b52d099..66f3f418d06 100644
--- a/gnu/llvm/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/gnu/llvm/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21,7 +21,6 @@
 #include "ARMRegisterInfo.h"
 #include "ARMSelectionDAGInfo.h"
 #include "ARMSubtarget.h"
-#include "ARMTargetTransformInfo.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "Utils/ARMBaseInfo.h"
@@ -55,7 +54,6 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -110,7 +108,6 @@
 #include <cstdlib>
 #include <iterator>
 #include <limits>
-#include <optional>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -146,7 +143,7 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
     cl::desc("Maximum size of ALL constants to promote into a constant pool"),
     cl::init(128));

-cl::opt<unsigned>
+static cl::opt<unsigned>
 MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
   cl::desc("Maximum interleave factor for MVE VLDn to generate."),
   cl::init(2));
@@ -156,7 +153,8 @@ static const MCPhysReg GPRArgRegs[] = {
   ARM::R0, ARM::R1, ARM::R2, ARM::R3
 };

-void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
+                                       MVT PromotedBitwiseVT) {
   if (VT != PromotedLdStVT) {
     setOperationAction(ISD::LOAD, VT, Promote);
     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
@@ -195,6 +193,16 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
     setOperationAction(ISD::SRL, VT, Custom);
   }

+  // Promote all bit-wise operations.
+  if (VT.isInteger() && VT != PromotedBitwiseVT) {
+    setOperationAction(ISD::AND, VT, Promote);
+    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
+    setOperationAction(ISD::OR, VT, Promote);
+    AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
+    setOperationAction(ISD::XOR, VT, Promote);
+    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
+  }
+
   // Neon does not support vector divide/remainder operations.
setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); @@ -202,8 +210,6 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) { setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); - setOperationAction(ISD::UDIVREM, VT, Expand); if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) @@ -216,12 +222,12 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) { void ARMTargetLowering::addDRTypeForNEON(MVT VT) { addRegisterClass(VT, &ARM::DPRRegClass); - addTypeForNEON(VT, MVT::f64); + addTypeForNEON(VT, MVT::f64, MVT::v2i32); } void ARMTargetLowering::addQRTypeForNEON(MVT VT) { addRegisterClass(VT, &ARM::DPairRegClass); - addTypeForNEON(VT, MVT::v2f64); + addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } void ARMTargetLowering::setAllExpand(MVT VT) { @@ -272,23 +278,13 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); - setOperationAction(ISD::ABDS, VT, Legal); - setOperationAction(ISD::ABDU, VT, Legal); - setOperationAction(ISD::AVGFLOORS, VT, Legal); - setOperationAction(ISD::AVGFLOORU, VT, Legal); - setOperationAction(ISD::AVGCEILS, VT, Legal); - setOperationAction(ISD::AVGCEILU, VT, Legal); // No native support for these. setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::UDIVREM, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); // Vector reductions setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); @@ -296,19 +292,12 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); - setOperationAction(ISD::VECREDUCE_MUL, VT, Custom); - setOperationAction(ISD::VECREDUCE_AND, VT, Custom); - setOperationAction(ISD::VECREDUCE_OR, VT, Custom); - setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); - } else { - setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); } // Pre and Post inc are supported on loads and stores @@ -338,8 +327,6 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Legal); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); // Pre and Post inc are supported on loads and stores for (unsigned im = (unsigned)ISD::PRE_INC; @@ -354,10 +341,6 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::FMINNUM, VT, Legal); setOperationAction(ISD::FMAXNUM, VT, Legal); setOperationAction(ISD::FROUND, VT, Legal); - setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); - setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); - 
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); // No native support for these. setOperationAction(ISD::FDIV, VT, Expand); @@ -375,17 +358,6 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { } } - // Custom Expand smaller than legal vector reductions to prevent false zero - // items being added. - setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom); - setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom); - setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom); - setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom); - setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom); - setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom); - setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom); - setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom); - // We 'support' these types up to bitcast/load/store level, regardless of // MVE integer-only / float support. Only doing FP data processing on the FP // vector types is inhibited at integer-only level. @@ -396,11 +368,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); - // We can do bitwise operations on v2i64 vectors setOperationAction(ISD::AND, MVT::v2i64, Legal); setOperationAction(ISD::OR, MVT::v2i64, Legal); @@ -435,7 +403,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { } // Predicate types - const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1}; + const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; for (auto VT : pTypes) { addRegisterClass(VT, &ARM::VCCRRegClass); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -448,36 +416,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); - setOperationAction(ISD::TRUNCATE, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - - if (!HasMVEFP) { - setOperationAction(ISD::SINT_TO_FP, VT, Expand); - setOperationAction(ISD::UINT_TO_FP, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT, Expand); - setOperationAction(ISD::FP_TO_UINT, VT, Expand); - } } - setOperationAction(ISD::SETCC, MVT::v2i1, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand); - setOperationAction(ISD::AND, MVT::v2i1, Expand); - setOperationAction(ISD::OR, MVT::v2i1, Expand); - setOperationAction(ISD::XOR, MVT::v2i1, Expand); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Expand); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Expand); - - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); - 
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); } ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, @@ -490,7 +429,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && - !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) { + !Subtarget->isTargetWatchOS()) { bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID), @@ -572,9 +511,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); - setLibcallName(RTLIB::MUL_I128, nullptr); - setLibcallName(RTLIB::MULO_I64, nullptr); - setLibcallName(RTLIB::MULO_I128, nullptr); // RTLIB if (Subtarget->isAAPCS_ABI() && @@ -772,12 +708,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, Subtarget->hasFPRegs()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); addRegisterClass(MVT::f64, &ARM::DPRRegClass); - - setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); - if (!Subtarget->hasVFP2Base()) setAllExpand(MVT::f32); if (!Subtarget->hasFP64()) @@ -787,26 +717,22 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasFullFP16()) { addRegisterClass(MVT::f16, &ARM::HPRRegClass); setOperationAction(ISD::BITCAST, MVT::i16, Custom); + setOperationAction(ISD::BITCAST, MVT::i32, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); setOperationAction(ISD::FMINNUM, MVT::f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); } - if (Subtarget->hasBF16()) { - addRegisterClass(MVT::bf16, &ARM::HPRRegClass); - setAllExpand(MVT::bf16); - if (!Subtarget->hasFullFP16()) - setOperationAction(ISD::BITCAST, MVT::bf16, Custom); - } - for (MVT VT : MVT::fixedlen_vector_valuetypes()) { for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); addAllExtLoads(VT, InnerVT, Expand); } + setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); @@ -823,7 +749,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Combine low-overhead loop intrinsics so that we can lower i1 types. 
if (Subtarget->hasLOB()) { - setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC}); + setTargetDAGCombine(ISD::BRCOND); + setTargetDAGCombine(ISD::BR_CC); } if (Subtarget->hasNEON()) { @@ -844,11 +771,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8f16); addDRTypeForNEON(MVT::v4f16); } - - if (Subtarget->hasBF16()) { - addQRTypeForNEON(MVT::v8bf16); - addDRTypeForNEON(MVT::v4bf16); - } } if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { @@ -984,19 +906,22 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); - for (MVT VT : MVT::fixedlen_vector_valuetypes()) { - setOperationAction(ISD::MULHS, VT, Expand); - setOperationAction(ISD::MULHU, VT, Expand); - } - // NEON only has FMA instructions as of VFP4. if (!Subtarget->hasVFP4Base()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); setOperationAction(ISD::FMA, MVT::v4f32, Expand); } - setTargetDAGCombine({ISD::SHL, ISD::SRL, ISD::SRA, ISD::FP_TO_SINT, - ISD::FP_TO_UINT, ISD::FDIV, ISD::LOAD}); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::FP_TO_UINT); + setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::LOAD); // It is legal to extload from v4i8 to v4i16 or v4i32. for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, @@ -1010,20 +935,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { - setTargetDAGCombine( - {ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE, ISD::INSERT_SUBVECTOR, - ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, - ISD::SIGN_EXTEND_INREG, ISD::STORE, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, - ISD::ANY_EXTEND, ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN, - ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST}); - } - if (Subtarget->hasMVEIntegerOps()) { - setTargetDAGCombine({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX, - ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC, - ISD::SETCC}); - } - if (Subtarget->hasMVEFloatOps()) { - setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); } if (!Subtarget->hasFP64()) { @@ -1131,10 +1049,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); setOperationAction(ISD::SADDSAT, MVT::i16, Custom); setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); - setOperationAction(ISD::UADDSAT, MVT::i8, Custom); - setOperationAction(ISD::USUBSAT, MVT::i8, Custom); - setOperationAction(ISD::UADDSAT, MVT::i16, Custom); - setOperationAction(ISD::USUBSAT, MVT::i16, Custom); } if (Subtarget->hasBaseDSP()) { setOperationAction(ISD::SADDSAT, MVT::i32, Legal); @@ -1159,8 +1073,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRA, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); - setOperationAction(ISD::LOAD, 
MVT::i64, Custom); - setOperationAction(ISD::STORE, MVT::i64, Custom); // MVE lowers 64 bit shifts to lsll and lsrl // assuming that ISD::SRL and SRA of i64 are already marked custom @@ -1357,32 +1269,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } - // Compute supported atomic widths. - if (Subtarget->isTargetLinux() || - (!Subtarget->isMClass() && Subtarget->hasV6Ops())) { - // For targets where __sync_* routines are reliably available, we use them - // if necessary. - // - // ARM Linux always supports 64-bit atomics through kernel-assisted atomic - // routines (kernel 3.1 or later). FIXME: Not with compiler-rt? - // - // ARMv6 targets have native instructions in ARM mode. For Thumb mode, - // such targets should provide __sync_* routines, which use the ARM mode - // instructions. (ARMv6 doesn't have dmb, but it has an equivalent - // encoding; see ARMISD::MEMBARRIER_MCR.) - setMaxAtomicSizeInBitsSupported(64); - } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) || - Subtarget->hasForced32BitAtomics()) { - // Cortex-M (besides Cortex-M0) have 32-bit atomics. - setMaxAtomicSizeInBitsSupported(32); - } else { - // We can't assume anything about other targets; just use libatomic - // routines. - setMaxAtomicSizeInBitsSupported(0); - } - - setMaxDivRemBitWidthSupported(64); - setOperationAction(ISD::PREFETCH, MVT::Other, Custom); // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. @@ -1397,8 +1283,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. setOperationAction(ISD::BITCAST, MVT::i64, Custom); - setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); - setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); + setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); } // We want to custom lower some of our intrinsics. @@ -1534,16 +1419,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } if (Subtarget->hasNEON()) { - // vmin and vmax aren't available in a scalar form, so we can use - // a NEON instruction with an undef lane instead. This has a performance - // penalty on some cores, so we don't do this unless we have been - // asked to by the core tuning model. - if (Subtarget->useNEONForSinglePrecisionFP()) { - setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); - setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); - } + // vmin and vmax aren't available in a scalar form, so we use + // a NEON instruction with an undef lane instead. 
+ setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); + setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); @@ -1564,21 +1445,17 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine - setTargetDAGCombine( - {ISD::ADD, ISD::SUB, ISD::MUL, ISD::AND, ISD::OR, ISD::XOR}); - - if (Subtarget->hasMVEIntegerOps()) - setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::XOR); if (Subtarget->hasV6Ops()) setTargetDAGCombine(ISD::SRL); if (Subtarget->isThumb1Only()) setTargetDAGCombine(ISD::SHL); - // Attempt to lower smin/smax to ssat/usat - if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || - Subtarget->isThumb2()) { - setTargetDAGCombine({ISD::SMIN, ISD::SMAX}); - } setStackPointerRegisterToSaveRestore(ARM::SP); @@ -1664,216 +1541,170 @@ ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, } const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { -#define MAKE_CASE(V) \ - case V: \ - return #V; switch ((ARMISD::NodeType)Opcode) { - case ARMISD::FIRST_NUMBER: - break; - MAKE_CASE(ARMISD::Wrapper) - MAKE_CASE(ARMISD::WrapperPIC) - MAKE_CASE(ARMISD::WrapperJT) - MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL) - MAKE_CASE(ARMISD::CALL) - MAKE_CASE(ARMISD::CALL_PRED) - MAKE_CASE(ARMISD::CALL_NOLINK) - MAKE_CASE(ARMISD::tSECALL) - MAKE_CASE(ARMISD::t2CALL_BTI) - MAKE_CASE(ARMISD::BRCOND) - MAKE_CASE(ARMISD::BR_JT) - MAKE_CASE(ARMISD::BR2_JT) - MAKE_CASE(ARMISD::RET_FLAG) - MAKE_CASE(ARMISD::SERET_FLAG) - MAKE_CASE(ARMISD::INTRET_FLAG) - MAKE_CASE(ARMISD::PIC_ADD) - MAKE_CASE(ARMISD::CMP) - MAKE_CASE(ARMISD::CMN) - MAKE_CASE(ARMISD::CMPZ) - MAKE_CASE(ARMISD::CMPFP) - MAKE_CASE(ARMISD::CMPFPE) - MAKE_CASE(ARMISD::CMPFPw0) - MAKE_CASE(ARMISD::CMPFPEw0) - MAKE_CASE(ARMISD::BCC_i64) - MAKE_CASE(ARMISD::FMSTAT) - MAKE_CASE(ARMISD::CMOV) - MAKE_CASE(ARMISD::SUBS) - MAKE_CASE(ARMISD::SSAT) - MAKE_CASE(ARMISD::USAT) - MAKE_CASE(ARMISD::ASRL) - MAKE_CASE(ARMISD::LSRL) - MAKE_CASE(ARMISD::LSLL) - MAKE_CASE(ARMISD::SRL_FLAG) - MAKE_CASE(ARMISD::SRA_FLAG) - MAKE_CASE(ARMISD::RRX) - MAKE_CASE(ARMISD::ADDC) - MAKE_CASE(ARMISD::ADDE) - MAKE_CASE(ARMISD::SUBC) - MAKE_CASE(ARMISD::SUBE) - MAKE_CASE(ARMISD::LSLS) - MAKE_CASE(ARMISD::VMOVRRD) - MAKE_CASE(ARMISD::VMOVDRR) - MAKE_CASE(ARMISD::VMOVhr) - MAKE_CASE(ARMISD::VMOVrh) - MAKE_CASE(ARMISD::VMOVSR) - MAKE_CASE(ARMISD::EH_SJLJ_SETJMP) - MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP) - MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH) - MAKE_CASE(ARMISD::TC_RETURN) - MAKE_CASE(ARMISD::THREAD_POINTER) - MAKE_CASE(ARMISD::DYN_ALLOC) - MAKE_CASE(ARMISD::MEMBARRIER_MCR) - MAKE_CASE(ARMISD::PRELOAD) - MAKE_CASE(ARMISD::LDRD) - MAKE_CASE(ARMISD::STRD) - MAKE_CASE(ARMISD::WIN__CHKSTK) - MAKE_CASE(ARMISD::WIN__DBZCHK) - MAKE_CASE(ARMISD::PREDICATE_CAST) - MAKE_CASE(ARMISD::VECTOR_REG_CAST) - MAKE_CASE(ARMISD::MVESEXT) - MAKE_CASE(ARMISD::MVEZEXT) - MAKE_CASE(ARMISD::MVETRUNC) - MAKE_CASE(ARMISD::VCMP) - MAKE_CASE(ARMISD::VCMPZ) - MAKE_CASE(ARMISD::VTST) - MAKE_CASE(ARMISD::VSHLs) - 
MAKE_CASE(ARMISD::VSHLu) - MAKE_CASE(ARMISD::VSHLIMM) - MAKE_CASE(ARMISD::VSHRsIMM) - MAKE_CASE(ARMISD::VSHRuIMM) - MAKE_CASE(ARMISD::VRSHRsIMM) - MAKE_CASE(ARMISD::VRSHRuIMM) - MAKE_CASE(ARMISD::VRSHRNIMM) - MAKE_CASE(ARMISD::VQSHLsIMM) - MAKE_CASE(ARMISD::VQSHLuIMM) - MAKE_CASE(ARMISD::VQSHLsuIMM) - MAKE_CASE(ARMISD::VQSHRNsIMM) - MAKE_CASE(ARMISD::VQSHRNuIMM) - MAKE_CASE(ARMISD::VQSHRNsuIMM) - MAKE_CASE(ARMISD::VQRSHRNsIMM) - MAKE_CASE(ARMISD::VQRSHRNuIMM) - MAKE_CASE(ARMISD::VQRSHRNsuIMM) - MAKE_CASE(ARMISD::VSLIIMM) - MAKE_CASE(ARMISD::VSRIIMM) - MAKE_CASE(ARMISD::VGETLANEu) - MAKE_CASE(ARMISD::VGETLANEs) - MAKE_CASE(ARMISD::VMOVIMM) - MAKE_CASE(ARMISD::VMVNIMM) - MAKE_CASE(ARMISD::VMOVFPIMM) - MAKE_CASE(ARMISD::VDUP) - MAKE_CASE(ARMISD::VDUPLANE) - MAKE_CASE(ARMISD::VEXT) - MAKE_CASE(ARMISD::VREV64) - MAKE_CASE(ARMISD::VREV32) - MAKE_CASE(ARMISD::VREV16) - MAKE_CASE(ARMISD::VZIP) - MAKE_CASE(ARMISD::VUZP) - MAKE_CASE(ARMISD::VTRN) - MAKE_CASE(ARMISD::VTBL1) - MAKE_CASE(ARMISD::VTBL2) - MAKE_CASE(ARMISD::VMOVN) - MAKE_CASE(ARMISD::VQMOVNs) - MAKE_CASE(ARMISD::VQMOVNu) - MAKE_CASE(ARMISD::VCVTN) - MAKE_CASE(ARMISD::VCVTL) - MAKE_CASE(ARMISD::VIDUP) - MAKE_CASE(ARMISD::VMULLs) - MAKE_CASE(ARMISD::VMULLu) - MAKE_CASE(ARMISD::VQDMULH) - MAKE_CASE(ARMISD::VADDVs) - MAKE_CASE(ARMISD::VADDVu) - MAKE_CASE(ARMISD::VADDVps) - MAKE_CASE(ARMISD::VADDVpu) - MAKE_CASE(ARMISD::VADDLVs) - MAKE_CASE(ARMISD::VADDLVu) - MAKE_CASE(ARMISD::VADDLVAs) - MAKE_CASE(ARMISD::VADDLVAu) - MAKE_CASE(ARMISD::VADDLVps) - MAKE_CASE(ARMISD::VADDLVpu) - MAKE_CASE(ARMISD::VADDLVAps) - MAKE_CASE(ARMISD::VADDLVApu) - MAKE_CASE(ARMISD::VMLAVs) - MAKE_CASE(ARMISD::VMLAVu) - MAKE_CASE(ARMISD::VMLAVps) - MAKE_CASE(ARMISD::VMLAVpu) - MAKE_CASE(ARMISD::VMLALVs) - MAKE_CASE(ARMISD::VMLALVu) - MAKE_CASE(ARMISD::VMLALVps) - MAKE_CASE(ARMISD::VMLALVpu) - MAKE_CASE(ARMISD::VMLALVAs) - MAKE_CASE(ARMISD::VMLALVAu) - MAKE_CASE(ARMISD::VMLALVAps) - MAKE_CASE(ARMISD::VMLALVApu) - MAKE_CASE(ARMISD::VMINVu) - MAKE_CASE(ARMISD::VMINVs) - MAKE_CASE(ARMISD::VMAXVu) - MAKE_CASE(ARMISD::VMAXVs) - MAKE_CASE(ARMISD::UMAAL) - MAKE_CASE(ARMISD::UMLAL) - MAKE_CASE(ARMISD::SMLAL) - MAKE_CASE(ARMISD::SMLALBB) - MAKE_CASE(ARMISD::SMLALBT) - MAKE_CASE(ARMISD::SMLALTB) - MAKE_CASE(ARMISD::SMLALTT) - MAKE_CASE(ARMISD::SMULWB) - MAKE_CASE(ARMISD::SMULWT) - MAKE_CASE(ARMISD::SMLALD) - MAKE_CASE(ARMISD::SMLALDX) - MAKE_CASE(ARMISD::SMLSLD) - MAKE_CASE(ARMISD::SMLSLDX) - MAKE_CASE(ARMISD::SMMLAR) - MAKE_CASE(ARMISD::SMMLSR) - MAKE_CASE(ARMISD::QADD16b) - MAKE_CASE(ARMISD::QSUB16b) - MAKE_CASE(ARMISD::QADD8b) - MAKE_CASE(ARMISD::QSUB8b) - MAKE_CASE(ARMISD::UQADD16b) - MAKE_CASE(ARMISD::UQSUB16b) - MAKE_CASE(ARMISD::UQADD8b) - MAKE_CASE(ARMISD::UQSUB8b) - MAKE_CASE(ARMISD::BUILD_VECTOR) - MAKE_CASE(ARMISD::BFI) - MAKE_CASE(ARMISD::VORRIMM) - MAKE_CASE(ARMISD::VBICIMM) - MAKE_CASE(ARMISD::VBSP) - MAKE_CASE(ARMISD::MEMCPY) - MAKE_CASE(ARMISD::VLD1DUP) - MAKE_CASE(ARMISD::VLD2DUP) - MAKE_CASE(ARMISD::VLD3DUP) - MAKE_CASE(ARMISD::VLD4DUP) - MAKE_CASE(ARMISD::VLD1_UPD) - MAKE_CASE(ARMISD::VLD2_UPD) - MAKE_CASE(ARMISD::VLD3_UPD) - MAKE_CASE(ARMISD::VLD4_UPD) - MAKE_CASE(ARMISD::VLD1x2_UPD) - MAKE_CASE(ARMISD::VLD1x3_UPD) - MAKE_CASE(ARMISD::VLD1x4_UPD) - MAKE_CASE(ARMISD::VLD2LN_UPD) - MAKE_CASE(ARMISD::VLD3LN_UPD) - MAKE_CASE(ARMISD::VLD4LN_UPD) - MAKE_CASE(ARMISD::VLD1DUP_UPD) - MAKE_CASE(ARMISD::VLD2DUP_UPD) - MAKE_CASE(ARMISD::VLD3DUP_UPD) - MAKE_CASE(ARMISD::VLD4DUP_UPD) - MAKE_CASE(ARMISD::VST1_UPD) - MAKE_CASE(ARMISD::VST2_UPD) - MAKE_CASE(ARMISD::VST3_UPD) - 
MAKE_CASE(ARMISD::VST4_UPD) - MAKE_CASE(ARMISD::VST1x2_UPD) - MAKE_CASE(ARMISD::VST1x3_UPD) - MAKE_CASE(ARMISD::VST1x4_UPD) - MAKE_CASE(ARMISD::VST2LN_UPD) - MAKE_CASE(ARMISD::VST3LN_UPD) - MAKE_CASE(ARMISD::VST4LN_UPD) - MAKE_CASE(ARMISD::WLS) - MAKE_CASE(ARMISD::WLSSETUP) - MAKE_CASE(ARMISD::LE) - MAKE_CASE(ARMISD::LOOP_DEC) - MAKE_CASE(ARMISD::CSINV) - MAKE_CASE(ARMISD::CSNEG) - MAKE_CASE(ARMISD::CSINC) - MAKE_CASE(ARMISD::MEMCPYLOOP) - MAKE_CASE(ARMISD::MEMSETLOOP) -#undef MAKE_CASE + case ARMISD::FIRST_NUMBER: break; + case ARMISD::Wrapper: return "ARMISD::Wrapper"; + case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; + case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; + case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL"; + case ARMISD::CALL: return "ARMISD::CALL"; + case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; + case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; + case ARMISD::BRCOND: return "ARMISD::BRCOND"; + case ARMISD::BR_JT: return "ARMISD::BR_JT"; + case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; + case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; + case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; + case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; + case ARMISD::CMP: return "ARMISD::CMP"; + case ARMISD::CMN: return "ARMISD::CMN"; + case ARMISD::CMPZ: return "ARMISD::CMPZ"; + case ARMISD::CMPFP: return "ARMISD::CMPFP"; + case ARMISD::CMPFPE: return "ARMISD::CMPFPE"; + case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; + case ARMISD::CMPFPEw0: return "ARMISD::CMPFPEw0"; + case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; + case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; + + case ARMISD::CMOV: return "ARMISD::CMOV"; + case ARMISD::SUBS: return "ARMISD::SUBS"; + + case ARMISD::SSAT: return "ARMISD::SSAT"; + case ARMISD::USAT: return "ARMISD::USAT"; + + case ARMISD::ASRL: return "ARMISD::ASRL"; + case ARMISD::LSRL: return "ARMISD::LSRL"; + case ARMISD::LSLL: return "ARMISD::LSLL"; + + case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; + case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; + case ARMISD::RRX: return "ARMISD::RRX"; + + case ARMISD::ADDC: return "ARMISD::ADDC"; + case ARMISD::ADDE: return "ARMISD::ADDE"; + case ARMISD::SUBC: return "ARMISD::SUBC"; + case ARMISD::SUBE: return "ARMISD::SUBE"; + case ARMISD::LSLS: return "ARMISD::LSLS"; + + case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; + case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; + case ARMISD::VMOVhr: return "ARMISD::VMOVhr"; + case ARMISD::VMOVrh: return "ARMISD::VMOVrh"; + case ARMISD::VMOVSR: return "ARMISD::VMOVSR"; + + case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; + case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; + + case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; + + case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; + + case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; + + case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; + + case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; + + case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; + case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; + + case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST"; + case ARMISD::VCMP: return "ARMISD::VCMP"; + case ARMISD::VCMPZ: return "ARMISD::VCMPZ"; + case ARMISD::VTST: return "ARMISD::VTST"; + + case ARMISD::VSHLs: return "ARMISD::VSHLs"; + case ARMISD::VSHLu: return "ARMISD::VSHLu"; + case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM"; + case 
ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM"; + case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM"; + case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM"; + case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM"; + case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM"; + case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM"; + case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM"; + case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM"; + case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM"; + case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM"; + case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM"; + case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM"; + case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM"; + case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM"; + case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM"; + case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM"; + case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; + case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; + case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; + case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; + case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; + case ARMISD::VDUP: return "ARMISD::VDUP"; + case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; + case ARMISD::VEXT: return "ARMISD::VEXT"; + case ARMISD::VREV64: return "ARMISD::VREV64"; + case ARMISD::VREV32: return "ARMISD::VREV32"; + case ARMISD::VREV16: return "ARMISD::VREV16"; + case ARMISD::VZIP: return "ARMISD::VZIP"; + case ARMISD::VUZP: return "ARMISD::VUZP"; + case ARMISD::VTRN: return "ARMISD::VTRN"; + case ARMISD::VTBL1: return "ARMISD::VTBL1"; + case ARMISD::VTBL2: return "ARMISD::VTBL2"; + case ARMISD::VMOVN: return "ARMISD::VMOVN"; + case ARMISD::VMULLs: return "ARMISD::VMULLs"; + case ARMISD::VMULLu: return "ARMISD::VMULLu"; + case ARMISD::UMAAL: return "ARMISD::UMAAL"; + case ARMISD::UMLAL: return "ARMISD::UMLAL"; + case ARMISD::SMLAL: return "ARMISD::SMLAL"; + case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; + case ARMISD::SMLALBT: return "ARMISD::SMLALBT"; + case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; + case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; + case ARMISD::SMULWB: return "ARMISD::SMULWB"; + case ARMISD::SMULWT: return "ARMISD::SMULWT"; + case ARMISD::SMLALD: return "ARMISD::SMLALD"; + case ARMISD::SMLALDX: return "ARMISD::SMLALDX"; + case ARMISD::SMLSLD: return "ARMISD::SMLSLD"; + case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; + case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; + case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; + case ARMISD::QADD16b: return "ARMISD::QADD16b"; + case ARMISD::QSUB16b: return "ARMISD::QSUB16b"; + case ARMISD::QADD8b: return "ARMISD::QADD8b"; + case ARMISD::QSUB8b: return "ARMISD::QSUB8b"; + case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; + case ARMISD::BFI: return "ARMISD::BFI"; + case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; + case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; + case ARMISD::VBSL: return "ARMISD::VBSL"; + case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; + case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; + case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; + case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; + case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; + case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; + case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; + case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; + case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; + case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; + case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; + case ARMISD::VLD4LN_UPD: return 
"ARMISD::VLD4LN_UPD"; + case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD"; + case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; + case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; + case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; + case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; + case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; + case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; + case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; + case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; + case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; + case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; + case ARMISD::WLS: return "ARMISD::WLS"; + case ARMISD::LE: return "ARMISD::LE"; + case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC"; + case ARMISD::CSINV: return "ARMISD::CSINV"; + case ARMISD::CSNEG: return "ARMISD::CSNEG"; + case ARMISD::CSINC: return "ARMISD::CSINC"; } return nullptr; } @@ -1884,11 +1715,8 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, return getPointerTy(DL); // MVE has a predicate register. - if ((Subtarget->hasMVEIntegerOps() && - (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || - VT == MVT::v16i8)) || - (Subtarget->hasMVEFloatOps() && - (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16))) + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } @@ -1902,18 +1730,12 @@ ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive // MVE Q registers. - if (Subtarget->hasNEON()) { + if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { if (VT == MVT::v4i64) return &ARM::QQPRRegClass; if (VT == MVT::v8i64) return &ARM::QQQQPRRegClass; } - if (Subtarget->hasMVEIntegerOps()) { - if (VT == MVT::v4i64) - return &ARM::MQQPRRegClass; - if (VT == MVT::v8i64) - return &ARM::MQQQQPRRegClass; - } return TargetLowering::getRegClassFor(VT); } @@ -1921,14 +1743,13 @@ ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { // source/dest is aligned and the copy size is large enough. We therefore want // to align such objects passed to memory intrinsics. bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, - Align &PrefAlign) const { + unsigned &PrefAlign) const { if (!isa<MemIntrinsic>(CI)) return false; MinSize = 8; // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 // cycle faster than 4-byte aligned LDM. - PrefAlign = - (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4)); + PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); return true; } @@ -2075,10 +1896,8 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, return CallingConv::PreserveMost; case CallingConv::ARM_AAPCS_VFP: case CallingConv::Swift: - case CallingConv::SwiftTail: return isVarArg ? 
CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; case CallingConv::C: - case CallingConv::Tail: if (!Subtarget->isAAPCS_ABI()) return CallingConv::ARM_APCS; else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && @@ -2136,35 +1955,6 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, } } -SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, - MVT LocVT, MVT ValVT, SDValue Val) const { - Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()), - Val); - if (Subtarget->hasFullFP16()) { - Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val); - } else { - Val = DAG.getNode(ISD::TRUNCATE, dl, - MVT::getIntegerVT(ValVT.getSizeInBits()), Val); - Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val); - } - return Val; -} - -SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, - MVT LocVT, MVT ValVT, - SDValue Val) const { - if (Subtarget->hasFullFP16()) { - Val = DAG.getNode(ARMISD::VMOVrh, dl, - MVT::getIntegerVT(LocVT.getSizeInBits()), Val); - } else { - Val = DAG.getNode(ISD::BITCAST, dl, - MVT::getIntegerVT(ValVT.getSizeInBits()), Val); - Val = DAG.getNode(ISD::ZERO_EXTEND, dl, - MVT::getIntegerVT(LocVT.getSizeInBits()), Val); - } - return DAG.getNode(ISD::BITCAST, dl, LocVT, Val); -} - /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue ARMTargetLowering::LowerCallResult( @@ -2192,8 +1982,7 @@ SDValue ARMTargetLowering::LowerCallResult( } SDValue Val; - if (VA.needsCustom() && - (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) { + if (VA.needsCustom()) { // Handle f64 or half of a v2f64. SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); @@ -2242,44 +2031,25 @@ SDValue ARMTargetLowering::LowerCallResult( break; } - // f16 arguments have their size extended to 4 bytes and passed as if they - // had been copied to the LSBs of a 32-bit register. - // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) - if (VA.needsCustom() && - (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) - Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val); - InVals.push_back(Val); } return Chain; } -std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg( - const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr, - bool IsTailCall, int SPDiff) const { - SDValue DstAddr; - MachinePointerInfo DstInfo; - int32_t Offset = VA.getLocMemOffset(); - MachineFunction &MF = DAG.getMachineFunction(); - - if (IsTailCall) { - Offset += SPDiff; - auto PtrVT = getPointerTy(DAG.getDataLayout()); - int Size = VA.getLocVT().getFixedSizeInBits() / 8; - int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); - DstAddr = DAG.getFrameIndex(FI, PtrVT); - DstInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); - } else { - SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); - DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - StackPtr, PtrOff); - DstInfo = - MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset); - } - - return std::make_pair(DstAddr, DstInfo); +/// LowerMemOpCallTo - Store the argument to the stack. 
+SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, + SDValue Arg, const SDLoc &dl, + SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); } void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, @@ -2288,8 +2058,7 @@ void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVectorImpl<SDValue> &MemOpChains, - bool IsTailCall, - int SPDiff) const { + ISD::ArgFlagsTy Flags) const { SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); unsigned id = Subtarget->isLittle() ? 0 : 1; @@ -2303,20 +2072,12 @@ void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); - SDValue DstAddr; - MachinePointerInfo DstInfo; - std::tie(DstAddr, DstInfo) = - computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff); - MemOpChains.push_back( - DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo)); + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), + dl, DAG, NextVA, + Flags)); } } -static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { - return (CC == CallingConv::Fast && GuaranteeTailCalls) || - CC == CallingConv::Tail || CC == CallingConv::SwiftTail; -} - /// LowerCall - Lowering a call into a callseq_start <- /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter /// nodes. @@ -2336,41 +2097,22 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); MachineFunction::CallSiteInfo CSInfo; bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; - bool isCmseNSCall = false; - bool isSibCall = false; bool PreferIndirect = false; - bool GuardWithBTI = false; - - // Lower 'returns_twice' calls to a pseudo-instruction. - if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) && - !Subtarget->noBTIAtReturnTwice()) - GuardWithBTI = AFI->branchTargetEnforcement(); - - // Determine whether this is a non-secure function call. - if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call")) - isCmseNSCall = true; // Disable tail calls if they're not supported. if (!Subtarget->supportsTailCall()) isTailCall = false; - // For both the non-secure calls and the returns from a CMSE entry function, - // the function needs to do some extra work afte r the call, or before the - // return, respectively, thus it cannot end with atail call - if (isCmseNSCall || AFI->isCmseNSEntryFunction()) - isTailCall = false; - if (isa<GlobalAddressSDNode>(Callee)) { // If we're optimizing for minimum size and the function is called three or // more times in this block, we can improve codesize by calling indirectly // as BLXr has a 16-bit encoding. 
auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); - if (CLI.CB) { - auto *BB = CLI.CB->getParent(); + if (CLI.CS) { + auto *BB = CLI.CS.getParent(); PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && count_if(GV->users(), [&BB](const User *U) { return isa<Instruction>(U) && @@ -2384,20 +2126,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee, CallConv, isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, PreferIndirect); - - if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt && - CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail) - isSibCall = true; - + if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. if (isTailCall) ++NumTailCalls; } - if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall()) - report_fatal_error("failed to perform tail call elimination on a call " - "site marked musttail"); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, @@ -2407,40 +2144,13 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); - // SPDiff is the byte offset of the call's argument area from the callee's. - // Stores to callee stack arguments will be placed in FixedStackSlots offset - // by this amount for a tail call. In a sibling call it must be 0 because the - // caller will deallocate the entire stack and the callee still expects its - // arguments to begin at SP+0. Completely unused for non-tail calls. - int SPDiff = 0; - - if (isTailCall && !isSibCall) { - auto FuncInfo = MF.getInfo<ARMFunctionInfo>(); - unsigned NumReusableBytes = FuncInfo->getArgumentStackSize(); - - // Since callee will pop argument stack as a tail call, we must keep the - // popped size 16-byte aligned. - Align StackAlign = DAG.getDataLayout().getStackAlignment(); - NumBytes = alignTo(NumBytes, StackAlign); - - // SPDiff will be negative if this tail call requires more space than we - // would automatically have in our incoming argument space. Positive if we - // can actually shrink the stack. - SPDiff = NumReusableBytes - NumBytes; - - // If this call requires more stack than we have available from - // LowerFormalArguments, tell FrameLowering to reserve space for it. - if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff) - AFI->setArgRegsSaveSize(-SPDiff); - } - - if (isSibCall) { - // For sibling tail calls, memory operands are available in our caller's stack. + if (isTailCall) { + // For tail calls, memory operands are available in our caller's stack. NumBytes = 0; } else { // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 
0 : NumBytes, 0, dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); } SDValue StackPtr = @@ -2449,13 +2159,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPassVector RegsToPass; SmallVector<SDValue, 8> MemOpChains; - // During a tail call, stores to the argument area must happen after all of - // the function's incoming arguments have been loaded because they may alias. - // This is done by folding in a TokenFactor from LowerFormalArguments, but - // there's no point in doing so repeatedly so this tracks whether that's - // happened yet. - bool AfterFormalArgLoads = false; - // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization, arguments are handled later. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); @@ -2484,57 +2187,31 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, break; } - if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) { - Chain = DAG.getStackArgumentTokenFactor(Chain); - AfterFormalArgLoads = true; - } + // f64 and v2f64 might be passed in i32 pairs and must be split into pieces + if (VA.needsCustom()) { + if (VA.getLocVT() == MVT::v2f64) { + SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, dl, MVT::i32)); + SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, dl, MVT::i32)); - // f16 arguments have their size extended to 4 bytes and passed as if they - // had been copied to the LSBs of a 32-bit register. - // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) - if (VA.needsCustom() && - (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) { - Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); - } else { - // f16 arguments could have been extended prior to argument lowering. - // Mask them arguments if this is a CMSE nonsecure call. 
- auto ArgVT = Outs[realArgIdx].ArgVT; - if (isCmseNSCall && (ArgVT == MVT::f16)) { - auto LocBits = VA.getLocVT().getSizeInBits(); - auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits()); - SDValue Mask = - DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); - Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); - Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); - Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); - } - } + PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); - // f64 and v2f64 might be passed in i32 pairs and must be split into pieces - if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { - SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(0, dl, MVT::i32)); - SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(1, dl, MVT::i32)); - - PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], - StackPtr, MemOpChains, isTailCall, SPDiff); - - VA = ArgLocs[++i]; // skip ahead to next loc - if (VA.isRegLoc()) { - PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], - StackPtr, MemOpChains, isTailCall, SPDiff); + VA = ArgLocs[++i]; // skip ahead to next loc + if (VA.isRegLoc()) { + PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + } else { + assert(VA.isMemLoc()); + + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, + dl, DAG, VA, Flags)); + } } else { - assert(VA.isMemLoc()); - SDValue DstAddr; - MachinePointerInfo DstInfo; - std::tie(DstAddr, DstInfo) = - computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); - MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo)); + PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], + StackPtr, MemOpChains, Flags); } - } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { - PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], - StackPtr, MemOpChains, isTailCall, SPDiff); } else if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i32) { @@ -2545,7 +2222,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, isThisReturn = true; } const TargetOptions &Options = DAG.getTarget().Options; - if (Options.EmitCallSiteInfo) + if (Options.EnableDebugEntryValues) CSInfo.emplace_back(VA.getLocReg(), i); RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { @@ -2568,9 +2245,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); - SDValue Load = - DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), - DAG.InferPtrAlign(AddArg)); + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, + MachinePointerInfo(), + DAG.InferPtrAlignment(AddArg)); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(j, Load)); } @@ -2584,31 +2261,26 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Flags.getByValSize() > 4*offset) { auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Dst; - MachinePointerInfo DstInfo; - std::tie(Dst, DstInfo) = - computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue StkPtrOff = 
DAG.getIntPtrConstant(LocMemOffset, dl); + SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); - SDValue AlignNode = - DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32); + SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, + MVT::i32); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops)); } - } else { + } else if (!isTailCall) { assert(VA.isMemLoc()); - SDValue DstAddr; - MachinePointerInfo DstInfo; - std::tie(DstAddr, DstInfo) = - computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff); - SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo); - MemOpChains.push_back(Store); + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, + dl, DAG, VA, Flags)); } } @@ -2631,14 +2303,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const TargetMachine &TM = getTargetMachine(); const Module *Mod = MF.getFunction().getParent(); - const GlobalValue *GVal = nullptr; + const GlobalValue *GV = nullptr; if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - GVal = G->getGlobal(); + GV = G->getGlobal(); bool isStub = - !TM.shouldAssumeDSOLocal(*Mod, GVal) && Subtarget->isTargetMachO(); + !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); bool isLocalARMFunc = false; + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); auto PtrVt = getPointerTy(DAG.getDataLayout()); if (Subtarget->genLongCalls()) { @@ -2648,58 +2321,36 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // those, the target's already in a register, so we don't need to do // anything extra. if (isa<GlobalAddressSDNode>(Callee)) { - // When generating execute-only code we use movw movt pair. - // Currently execute-only is only available for architectures that - // support movw movt, so we are safe to assume that. 
- if (Subtarget->genExecuteOnly()) { - assert(Subtarget->useMovt() && - "long-calls with execute-only requires movt and movw!"); - ++NumMovwMovt; - Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt, - DAG.getTargetGlobalAddress(GVal, dl, PtrVt)); - } else { - // Create a constant pool entry for the callee address - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( - GVal, ARMPCLabelIndex, ARMCP::CPValue, 0); - - // Get the address of the callee into a register - SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); - Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr); - Callee = DAG.getLoad( - PtrVt, dl, DAG.getEntryNode(), Addr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); - } + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); + + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); - // When generating execute-only code we use movw movt pair. - // Currently execute-only is only available for architectures that - // support movw movt, so we are safe to assume that. - if (Subtarget->genExecuteOnly()) { - assert(Subtarget->useMovt() && - "long-calls with execute-only requires movt and movw!"); - ++NumMovwMovt; - Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt, - DAG.getTargetGlobalAddress(GVal, dl, PtrVt)); - } else { - // Create a constant pool entry for the callee address - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create( - *DAG.getContext(), Sym, ARMPCLabelIndex, 0); - - // Get the address of the callee into a register - SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); - Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr); - Callee = DAG.getLoad( - PtrVt, dl, DAG.getEntryNode(), Addr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); - } + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, + ARMPCLabelIndex, 0); + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } } else if (isa<GlobalAddressSDNode>(Callee)) { if (!PreferIndirect) { isDirect = true; - bool isDef = GVal->isStrongDefinitionForLinker(); + bool isDef = GV->isStrongDefinitionForLinker(); // ARM call to a local ARM function is predicable. 
isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); @@ -2708,21 +2359,21 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); Callee = DAG.getNode( ARMISD::WrapperPIC, dl, PtrVt, - DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY)); + DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(), - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + /* Alignment = */ 0, MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); unsigned TargetFlags = ARMII::MO_NO_FLAG; - if (GVal->hasDLLImportStorageClass()) + if (GV->hasDLLImportStorageClass()) TargetFlags = ARMII::MO_DLLIMPORT; - else if (!TM.shouldAssumeDSOLocal(*GVal->getParent(), GVal)) + else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) TargetFlags = ARMII::MO_COFFSTUB; - Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0, + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, TargetFlags); if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) Callee = @@ -2730,7 +2381,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), MachinePointerInfo::getGOT(DAG.getMachineFunction())); } else { - Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0); + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); } } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { @@ -2742,7 +2393,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, @@ -2754,33 +2405,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } - if (isCmseNSCall) { - assert(!isARMFunc && !isDirect && - "Cannot handle call to ARM function or direct call"); - if (NumBytes > 0) { - DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(), - "call to non-secure function would " - "require passing arguments on stack", - dl.getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } - if (isStructRet) { - DiagnosticInfoUnsupported Diag( - DAG.getMachineFunction().getFunction(), - "call to non-secure function would return value through pointer", - dl.getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } - } - // FIXME: handle tail calls differently. unsigned CallOpc; if (Subtarget->isThumb()) { - if (GuardWithBTI) - CallOpc = ARMISD::t2CALL_BTI; - else if (isCmseNSCall) - CallOpc = ARMISD::tSECALL; - else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) + if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else CallOpc = ARMISD::CALL; @@ -2796,23 +2424,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallOpc = isLocalARMFunc ? 
ARMISD::CALL_PRED : ARMISD::CALL; } - // We don't usually want to end the call-sequence here because we would tidy - // the frame up *after* the call, however in the ABI-changing tail-call case - // we've carefully laid out the parameters so that when sp is reset they'll be - // in the correct location. - if (isTailCall && !isSibCall) { - Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, dl); - InFlag = Chain.getValue(1); - } - std::vector<SDValue> Ops; Ops.push_back(Chain); Ops.push_back(Callee); - if (isTailCall) { - Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32)); - } - // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) @@ -2820,23 +2435,25 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. - const uint32_t *Mask; - const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); - if (isThisReturn) { - // For 'this' returns, use the R0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(MF, CallConv); - if (!Mask) { - // Set isThisReturn to false if the calling convention is not one that - // allows 'returned' to be modeled in this way, so LowerCallResult does - // not try to pass 'this' straight through - isThisReturn = false; + if (!isTailCall) { + const uint32_t *Mask; + const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); + if (isThisReturn) { + // For 'this' returns, use the R0-preserving mask if applicable + Mask = ARI->getThisReturnPreservedMask(MF, CallConv); + if (!Mask) { + // Set isThisReturn to false if the calling convention is not one that + // allows 'returned' to be modeled in this way, so LowerCallResult does + // not try to pass 'this' straight through + isThisReturn = false; + Mask = ARI->getCallPreservedMask(MF, CallConv); + } + } else Mask = ARI->getCallPreservedMask(MF, CallConv); - } - } else - Mask = ARI->getCallPreservedMask(MF, CallConv); - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + } if (InFlag.getNode()) Ops.push_back(InFlag); @@ -2851,18 +2468,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); - DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); - // If we're guaranteeing tail-calls will be honoured, the callee must - // pop its own argument stack on return. But this call is *not* a tail call so - // we need to undo that after it returns to restore the status-quo. - bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; - uint64_t CalleePopBytes = - canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL; - - Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, dl); + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), + DAG.getIntPtrConstant(0, dl, true), InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); @@ -2878,15 +2488,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, /// and then confiscate the rest of the parameter registers to insure /// this. 
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, - Align Alignment) const { + unsigned Align) const { // Byval (as with any stack) slots are always at least 4 byte aligned. - Alignment = std::max(Alignment, Align(4)); + Align = std::max(Align, 4U); unsigned Reg = State->AllocateReg(GPRArgRegs); if (!Reg) return; - unsigned AlignInRegs = Alignment.value() / 4; + unsigned AlignInRegs = Align / 4; unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; for (unsigned i = 0; i < Waste; ++i) Reg = State->AllocateReg(GPRArgRegs); @@ -2937,8 +2547,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, unsigned Bytes = Arg.getValueSizeInBits() / 8; int FI = std::numeric_limits<int>::max(); if (Arg.getOpcode() == ISD::CopyFromReg) { - Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); - if (!VR.isVirtual()) + unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) @@ -2990,17 +2600,9 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( // Indirect tail calls cannot be optimized for Thumb1 if the args // to the call take up r0-r3. The reason is that there are no legal registers // left to hold the pointer to the function to be called. - // Similarly, if the function uses return address sign and authentication, - // r12 is needed to hold the PAC and is not available to hold the callee - // address. - if (Outs.size() >= 4 && - (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) { - if (Subtarget->isThumb1Only()) - return false; - // Conservatively assume the function spills LR. - if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)) - return false; - } + if (Subtarget->isThumb1Only() && Outs.size() >= 4 && + (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) + return false; // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. @@ -3011,9 +2613,6 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( if (CallerF.hasFnAttribute("interrupt")) return false; - if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) - return CalleeCC == CallerCC; - // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) @@ -3036,11 +2635,9 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( // Check that the call results are passed in the same way. LLVMContext &C = *DAG.getContext(); - if (!CCState::resultsCompatible( - getEffectiveCallingConv(CalleeCC, isVarArg), - getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins, - CCAssignFnForReturn(CalleeCC, isVarArg), - CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) + if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, + CCAssignFnForReturn(CalleeCC, isVarArg), + CCAssignFnForReturn(CallerCC, isVarArg))) return false; // The callee has to preserve all registers the caller needs to preserve. const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); @@ -3081,7 +2678,7 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; - if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) { + if (VA.needsCustom()) { // f64 and vector types are split into multiple registers or // register/stack-slot combinations. 
The types will not match // the registers; give up on memory f64 refs until we figure @@ -3180,17 +2777,6 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); AFI->setReturnRegsCount(RVLocs.size()); - // Report error if cmse entry function returns structure through first ptr arg. - if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) { - // Note: using an empty SDLoc(), as the first line of the function is a - // better place to report than the last line. - DiagnosticInfoUnsupported Diag( - DAG.getMachineFunction().getFunction(), - "secure entry function would return value through pointer", - SDLoc().getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } - // Copy the result values into the output registers. for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); @@ -3233,24 +2819,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, break; } - // Mask f16 arguments if this is a CMSE nonsecure entry. - auto RetVT = Outs[realRVLocIdx].ArgVT; - if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) { - if (VA.needsCustom() && VA.getValVT() == MVT::f16) { - Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); - } else { - auto LocBits = VA.getLocVT().getSizeInBits(); - auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits()); - SDValue Mask = - DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); - Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); - Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); - Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); - } - } - - if (VA.needsCustom() && - (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) { + if (VA.needsCustom()) { if (VA.getLocVT() == MVT::v2f64) { // Extract the first half and return it in two registers. SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, @@ -3258,15 +2827,15 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Half); - Chain = - DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(isLittleEndian ? 0 : 1), + Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc - Chain = - DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(isLittleEndian ? 1 : 0), + Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc @@ -3280,20 +2849,22 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - fmrrd.getValue(isLittleEndian ? 0 : 1), Flag); + fmrrd.getValue(isLittleEndian ? 0 : 1), + Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); + fmrrd.getValue(isLittleEndian ? 
1 : 0), + Flag); } else Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); // Guarantee that all emitted copies are // stuck together, avoiding something bad. Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister( - VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT())); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), + ReturnF16 ? MVT::f16 : VA.getLocVT())); } const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = @@ -3327,9 +2898,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return LowerInterruptReturn(RetOps, dl, DAG); } - ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG : - ARMISD::RET_FLAG; - return DAG.getNode(RetNode, dl, MVT::Other, RetOps); + return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); } bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -3350,24 +2919,26 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { SDNode *VMov = Copy; // f64 returned in a pair of GPRs. SmallPtrSet<SDNode*, 2> Copies; - for (SDNode *U : VMov->uses()) { - if (U->getOpcode() != ISD::CopyToReg) + for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != ISD::CopyToReg) return false; - Copies.insert(U); + Copies.insert(*UI); } if (Copies.size() > 2) return false; - for (SDNode *U : VMov->uses()) { - SDValue UseChain = U->getOperand(0); + for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); + UI != UE; ++UI) { + SDValue UseChain = UI->getOperand(0); if (Copies.count(UseChain.getNode())) // Second CopyToReg - Copy = U; + Copy = *UI; else { // We are at the top of this chain. // If the copy has a glue operand, we conservatively assume it // isn't safe to perform a tail call. - if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue) + if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) return false; // First CopyToReg TCChain = UseChain; @@ -3390,9 +2961,10 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { } bool HasRet = false; - for (const SDNode *U : Copy->uses()) { - if (U->getOpcode() != ARMISD::RET_FLAG && - U->getOpcode() != ARMISD::INTRET_FLAG) + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != ARMISD::RET_FLAG && + UI->getOpcode() != ARMISD::INTRET_FLAG) return false; HasRet = true; } @@ -3467,16 +3039,12 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, return LowerGlobalAddress(GA, DAG); } - // The 16-bit ADR instruction can only encode offsets that are multiples of 4, - // so we need to align to at least 4 bytes when we don't have 32-bit ADR. 
- Align CPAlign = CP->getAlign(); - if (Subtarget->isThumb1Only()) - CPAlign = std::max(CPAlign, Align(4)); if (CP->isMachineConstantPoolEntry()) - Res = - DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign); + Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, + CP->getAlignment()); else - Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign); + Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, + CP->getAlignment()); return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); } @@ -3495,14 +3063,14 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SDValue CPAddr; bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); if (!IsPositionIndependent) { - CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4)); + CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); } else { unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, ARMCP::CPBlockAddress, PCAdj); - CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); SDValue Result = DAG.getLoad( @@ -3554,7 +3122,8 @@ ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( MVT::i32, DL, Chain, DescAddr, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4), + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + /* Alignment = */ 4, MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); Chain = FuncTLVGet.getValue(1); @@ -3630,9 +3199,8 @@ ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, const auto *GA = cast<GlobalAddressSDNode>(Op); auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); SDValue Offset = DAG.getLoad( - PtrVT, DL, Chain, - DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, - DAG.getTargetConstantPool(CPV, PtrVT, Align(4))), + PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, + DAG.getTargetConstantPool(CPV, PtrVT, 4)), MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); @@ -3651,7 +3219,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); - SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); Argument = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), Argument, @@ -3702,7 +3270,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true); - Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, @@ -3720,7 +3288,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, assert(model == TLSModel::LocalExec); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); - Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = 
DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, @@ -3762,11 +3330,14 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { /// Return true if all users of V are within function F, looking through /// ConstantExprs. static bool allUsersAreInFunction(const Value *V, const Function *F) { - SmallVector<const User*,4> Worklist(V->users()); + SmallVector<const User*,4> Worklist; + for (auto *U : V->users()) + Worklist.push_back(U); while (!Worklist.empty()) { auto *U = Worklist.pop_back_val(); if (isa<ConstantExpr>(U)) { - append_range(Worklist, U->users()); + for (auto *UU : U->users()) + Worklist.push_back(UU); continue; } @@ -3809,7 +3380,7 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, // from .data to .text. This is not allowed in position-independent code. auto *Init = GVar->getInitializer(); if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) && - Init->needsDynamicRelocation()) + Init->needsRelocation()) return SDValue(); // The constant islands pass can only really deal with alignment requests @@ -3820,11 +3391,11 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, // that are strings for simplicity. auto *CDAInit = dyn_cast<ConstantDataArray>(Init); unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); - Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar); + unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar); unsigned RequiredPadding = 4 - (Size % 4); bool PaddingPossible = RequiredPadding == 4 || (CDAInit && CDAInit->isString()); - if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize || + if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || Size == 0) return SDValue(); @@ -3863,7 +3434,8 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, } auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); - SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4)); + SDValue CPAddr = + DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { AFI->markGlobalAsPromotedToConstantPool(GVar); AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + @@ -3875,7 +3447,7 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - if (!(GV = GA->getAliaseeObject())) + if (!(GV = GA->getBaseObject())) return false; if (const auto *V = dyn_cast<GlobalVariable>(GV)) return V->isConstant(); @@ -3933,7 +3505,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, } else { // use literal pool for address constant ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); RelAddr = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, @@ -3953,7 +3525,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, DAG.getTargetGlobalAddress(GV, dl, PtrVT)); } else { - SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4)); + SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); return DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), 
CPAddr, @@ -4061,10 +3633,10 @@ SDValue ARMTargetLowering::LowerINTRINSIC_VOID( ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); assert(Mask && "Missing call preserved mask for calling convention"); // Mark LR an implicit live-in. - Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); SDValue ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); - constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue}; + std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue}; SDValue Callee = DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); SDValue RegisterMask = DAG.getRegisterMask(Mask); @@ -4148,7 +3720,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, ARMCP::CPLSDA, PCAdj); - CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, @@ -4210,15 +3782,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::arm_mve_pred_v2i: return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), Op.getOperand(1)); - case Intrinsic::arm_mve_vreinterpretq: - return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(), - Op.getOperand(1)); - case Intrinsic::arm_mve_lsll: - return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::arm_mve_asrl: - return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } @@ -4315,7 +3878,7 @@ SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, RC = &ARM::GPRRegClass; // Transform the arguments stored in physical registers into virtual ones. - Register Reg = MF.addLiveIn(VA.getLocReg(), RC); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); SDValue ArgValue2; @@ -4385,7 +3948,7 @@ int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, AFI->isThumb1OnlyFunction() ? 
&ARM::tGPRRegClass : &ARM::GPRRegClass; for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { - Register VReg = MF.addLiveIn(Reg, RC); + unsigned VReg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(OrigArg, 4 * i)); @@ -4419,42 +3982,6 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, AFI->setVarArgsFrameIndex(FrameIndex); } -bool ARMTargetLowering::splitValueIntoRegisterParts( - SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, - unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const { - bool IsABIRegCopy = CC.has_value(); - EVT ValueVT = Val.getValueType(); - if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && - PartVT == MVT::f32) { - unsigned ValueBits = ValueVT.getSizeInBits(); - unsigned PartBits = PartVT.getSizeInBits(); - Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); - Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); - Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); - Parts[0] = Val; - return true; - } - return false; -} - -SDValue ARMTargetLowering::joinRegisterPartsIntoValue( - SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, - MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const { - bool IsABIRegCopy = CC.has_value(); - if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && - PartVT == MVT::f32) { - unsigned ValueBits = ValueVT.getSizeInBits(); - unsigned PartBits = PartVT.getSizeInBits(); - SDValue Val = Parts[0]; - - Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); - Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); - Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); - return Val; - } - return SDValue(); -} - SDValue ARMTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, @@ -4508,7 +4035,7 @@ SDValue ARMTargetLowering::LowerFormalArguments( int lastInsIndex = -1; if (isVarArg && MFI.hasVAStart()) { unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); - if (RegIdx != std::size(GPRArgRegs)) + if (RegIdx != array_lengthof(GPRArgRegs)) ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); } @@ -4527,41 +4054,44 @@ SDValue ARMTargetLowering::LowerFormalArguments( if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { + if (VA.needsCustom()) { // f64 and vector types are split up into multiple registers or // combinations of registers and stack slots. 
- SDValue ArgValue1 = - GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); - VA = ArgLocs[++i]; // skip ahead to next loc - SDValue ArgValue2; - if (VA.isMemLoc()) { - int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgValue2 = DAG.getLoad( - MVT::f64, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); - } else { - ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); - } - ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); - ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, - ArgValue1, DAG.getIntPtrConstant(0, dl)); - ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, - ArgValue2, DAG.getIntPtrConstant(1, dl)); - } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { - ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + if (VA.getLocVT() == MVT::v2f64) { + SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + VA = ArgLocs[++i]; // skip ahead to next loc + SDValue ArgValue2; + if (VA.isMemLoc()) { + int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FI)); + } else { + ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + } + ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue1, + DAG.getIntPtrConstant(0, dl)); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue2, + DAG.getIntPtrConstant(1, dl)); + } else + ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } else { const TargetRegisterClass *RC; - if (RegVT == MVT::f16 || RegVT == MVT::bf16) + + if (RegVT == MVT::f16) RC = &ARM::HPRRegClass; else if (RegVT == MVT::f32) RC = &ARM::SPRRegClass; - else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 || - RegVT == MVT::v4bf16) + else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) RC = &ARM::DPRRegClass; - else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 || - RegVT == MVT::v8bf16) + else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) RC = &ARM::QPRRegClass; else if (RegVT == MVT::i32) RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass @@ -4570,7 +4100,7 @@ SDValue ARMTargetLowering::LowerFormalArguments( llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. - Register Reg = MF.addLiveIn(VA.getLocReg(), RC); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); // If this value is passed in r0 and has the returned attribute (e.g. @@ -4601,16 +4131,9 @@ SDValue ARMTargetLowering::LowerFormalArguments( break; } - // f16 arguments have their size extended to 4 bytes and passed as if they - // had been copied to the LSBs of a 32-bit register. - // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) - if (VA.needsCustom() && - (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) - ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue); - InVals.push_back(ArgValue); } else { // VA.isRegLoc() - // Only arguments passed on the stack should make it here. 
+ // sanity check assert(VA.isMemLoc()); assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); @@ -4653,35 +4176,12 @@ SDValue ARMTargetLowering::LowerFormalArguments( } // varargs - if (isVarArg && MFI.hasVAStart()) { - VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(), + if (isVarArg && MFI.hasVAStart()) + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, + CCInfo.getNextStackOffset(), TotalArgRegsSaveSize); - if (AFI->isCmseNSEntryFunction()) { - DiagnosticInfoUnsupported Diag( - DAG.getMachineFunction().getFunction(), - "secure entry function must not be variadic", dl.getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } - } - unsigned StackArgSize = CCInfo.getNextStackOffset(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; - if (canGuaranteeTCO(CallConv, TailCallOpt)) { - // The only way to guarantee a tail call is if the callee restores its - // argument area, but it must also keep the stack aligned when doing so. - const DataLayout &DL = DAG.getDataLayout(); - StackArgSize = alignTo(StackArgSize, DL.getStackAlignment()); - - AFI->setArgumentStackToRestore(StackArgSize); - } - AFI->setArgumentStackSize(StackArgSize); - - if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) { - DiagnosticInfoUnsupported Diag( - DAG.getMachineFunction().getFunction(), - "secure entry function requires arguments on stack", dl.getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } + AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); return Chain; } @@ -5046,49 +4546,24 @@ SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } -static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { +static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { EVT VT = Op.getValueType(); - if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) + if (!Subtarget->hasDSP()) return SDValue(); if (!VT.isSimple()) return SDValue(); unsigned NewOpcode; + bool IsAdd = Op->getOpcode() == ISD::SADDSAT; switch (VT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::i8: - switch (Op->getOpcode()) { - case ISD::UADDSAT: - NewOpcode = ARMISD::UQADD8b; - break; - case ISD::SADDSAT: - NewOpcode = ARMISD::QADD8b; - break; - case ISD::USUBSAT: - NewOpcode = ARMISD::UQSUB8b; - break; - case ISD::SSUBSAT: - NewOpcode = ARMISD::QSUB8b; - break; - } + NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b; break; case MVT::i16: - switch (Op->getOpcode()) { - case ISD::UADDSAT: - NewOpcode = ARMISD::UQADD16b; - break; - case ISD::SADDSAT: - NewOpcode = ARMISD::QADD16b; - break; - case ISD::USUBSAT: - NewOpcode = ARMISD::UQSUB16b; - break; - case ISD::SSUBSAT: - NewOpcode = ARMISD::QSUB16b; - break; - } + NewOpcode = IsAdd ? ARMISD::QADD16b : ARMISD::QSUB16b; break; } @@ -5268,6 +4743,16 @@ static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); } +// Similar to isLowerSaturate(), but checks for upper-saturating conditions. +static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, + const SDValue TrueVal, const SDValue FalseVal, + const ISD::CondCode CC, const SDValue K) { + return (isGTorGE(CC) && + ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || + (isLTorLE(CC) && + ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); +} + // Check if two chained conditionals could be converted into SSAT or USAT. 
// // SSAT can replace a set of two conditional selectors that bound a number to an @@ -5279,68 +4764,101 @@ static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, // x < k ? (x < -k ? -k : x) : k // etc. // -// LLVM canonicalizes these to either a min(max()) or a max(min()) -// pattern. This function tries to match one of these and will return a SSAT -// node if successful. +// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is +// a power of 2. // -// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 -// is a power of 2. -static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - SDValue V1 = Op.getOperand(0); - SDValue K1 = Op.getOperand(1); +// It returns true if the conversion can be done, false otherwise. +// Additionally, the variable is returned in parameter V, the constant in K and +// usat is set to true if the conditional represents an unsigned saturation +static bool isSaturatingConditional(const SDValue &Op, SDValue &V, + uint64_t &K, bool &usat) { + SDValue LHS1 = Op.getOperand(0); + SDValue RHS1 = Op.getOperand(1); SDValue TrueVal1 = Op.getOperand(2); SDValue FalseVal1 = Op.getOperand(3); ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1; if (Op2.getOpcode() != ISD::SELECT_CC) - return SDValue(); + return false; - SDValue V2 = Op2.getOperand(0); - SDValue K2 = Op2.getOperand(1); + SDValue LHS2 = Op2.getOperand(0); + SDValue RHS2 = Op2.getOperand(1); SDValue TrueVal2 = Op2.getOperand(2); SDValue FalseVal2 = Op2.getOperand(3); ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); - SDValue V1Tmp = V1; - SDValue V2Tmp = V2; + // Find out which are the constants and which are the variables + // in each conditional + SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1) + ? &RHS1 + : nullptr; + SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2) + ? &RHS2 + : nullptr; + SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2; + SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; + SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2; + SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2; + + // We must detect cases where the original operations worked with 16- or + // 8-bit values. In such case, V2Tmp != V2 because the comparison operations + // must work with sign-extended values but the select operations return + // the original non-extended value. + SDValue V2TmpReg = V2Tmp; + if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG) + V2TmpReg = V2Tmp->getOperand(0); + + // Check that the registers and the constants have the correct values + // in both conditionals + if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp || + V2TmpReg != V2) + return false; - // Check that the registers and the constants match a max(min()) or min(max()) - // pattern - if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 || - K2 != FalseVal2 || - !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2)))) - return SDValue(); + // Figure out which conditional is saturating the lower/upper bound. + const SDValue *LowerCheckOp = + isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) + ? &Op + : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) + ? &Op2 + : nullptr; + const SDValue *UpperCheckOp = + isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) + ? 
&Op + : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) + ? &Op2 + : nullptr; + + if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) + return false; // Check that the constant in the lower-bound check is // the opposite of the constant in the upper-bound check // in 1's complement. - if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2)) - return SDValue(); - - int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue(); - int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue(); + int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue(); + int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue(); int64_t PosVal = std::max(Val1, Val2); int64_t NegVal = std::min(Val1, Val2); - if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) || - !isPowerOf2_64(PosVal + 1)) - return SDValue(); + if (((Val1 > Val2 && UpperCheckOp == &Op) || + (Val1 < Val2 && UpperCheckOp == &Op2)) && + isPowerOf2_64(PosVal + 1)) { - // Handle the difference between USAT (unsigned) and SSAT (signed) - // saturation - // At this point, PosVal is guaranteed to be positive - uint64_t K = PosVal; - SDLoc dl(Op); - if (Val1 == ~Val2) - return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp, - DAG.getConstant(countTrailingOnes(K), dl, VT)); - if (NegVal == 0) - return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp, - DAG.getConstant(countTrailingOnes(K), dl, VT)); + // Handle the difference between USAT (unsigned) and SSAT (signed) saturation + if (Val1 == ~Val2) + usat = false; + else if (NegVal == 0) + usat = true; + else + return false; - return SDValue(); + V = V2; + K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive + + return true; + } + + return false; } // Check if a condition of the type x < k ? k : x can be converted into a @@ -5400,9 +4918,18 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // Try to convert two saturating conditional selects into a single SSAT - if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) - if (SDValue SatValue = LowerSaturatingConditional(Op, DAG)) - return SatValue; + SDValue SatValue; + uint64_t SatConstant; + bool SatUSat; + if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && + isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) { + if (SatUSat) + return DAG.getNode(ARMISD::USAT, dl, VT, SatValue, + DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); + else + return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, + DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); + } // Try to convert expressions of the form x < k ? k : x (and similar forms) // into more efficient bit operations, which is possible when k is 0 or -1 @@ -5411,7 +4938,6 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // instructions. // Only allow this transformation on full-width (32-bit) operations SDValue LowerSatConstant; - SDValue SatValue; if (VT == MVT::i32 && isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, @@ -5469,6 +4995,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { std::swap(TVal, FVal); CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } + if (TVal == 0) + TrueVal = DAG.getRegister(ARM::ZR, MVT::i32); // Drops F's value because we can get it by inverting/negating TVal. 
FalseVal = TrueVal; @@ -5590,7 +5118,7 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlign(), + Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); llvm_unreachable("Unknown VFP cmp argument!"); @@ -5610,14 +5138,14 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue Ptr = Ld->getBasePtr(); RetVal1 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), - Ld->getAlign(), Ld->getMemOperand()->getFlags()); + Ld->getAlignment(), Ld->getMemOperand()->getFlags()); EVT PtrType = Ptr.getValueType(); + unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); SDValue NewPtr = DAG.getNode(ISD::ADD, dl, PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, - Ld->getPointerInfo().getWithOffset(4), - commonAlignment(Ld->getAlign(), 4), + Ld->getPointerInfo().getWithOffset(4), NewAlign, Ld->getMemOperand()->getFlags()); return; } @@ -5844,7 +5372,8 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { return DAG.UnrollVectorOp(Op.getNode()); } - const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16(); + const bool HasFullFP16 = + static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); EVT NewTy; const EVT OpTy = Op.getOperand(0).getValueType(); @@ -5903,43 +5432,6 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { return Op; } -static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - EVT VT = Op.getValueType(); - EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); - EVT FromVT = Op.getOperand(0).getValueType(); - - if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32) - return Op; - if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 && - Subtarget->hasFP64()) - return Op; - if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 && - Subtarget->hasFullFP16()) - return Op; - if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 && - Subtarget->hasMVEFloatOps()) - return Op; - if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 && - Subtarget->hasMVEFloatOps()) - return Op; - - if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16) - return SDValue(); - - SDLoc DL(Op); - bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT; - unsigned BW = ToVT.getScalarSizeInBits() - IsSigned; - SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), - DAG.getValueType(VT.getScalarType())); - SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT, - DAG.getConstant((1 << BW) - 1, DL, VT)); - if (IsSigned) - Max = DAG.getNode(ISD::SMAX, DL, VT, Max, - DAG.getConstant(-(1 << BW), DL, VT)); - return Max; -} - static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); SDLoc dl(Op); @@ -5954,7 +5446,8 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { Op.getOperand(0).getValueType() == MVT::v8i16) && "Invalid type for custom lowering!"); - const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16(); + const bool HasFullFP16 = + static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); EVT DestVecType; if (VT == MVT::v4f32) @@ -6106,7 +5599,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ } // Return LR, which contains the return address. 
Mark it an implicit live-in. - Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } @@ -6216,27 +5709,85 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 /// operand type is illegal (e.g., v2f32 for a target that doesn't support /// vectors), since the legalizer won't know what to do with that. -SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) const { +static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(N); SDValue Op = N->getOperand(0); - // This function is only supposed to be called for i16 and i64 types, either - // as the source or destination of the bit convert. + // This function is only supposed to be called for i64 types, either as the + // source or destination of the bit convert. EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); + const bool HasFullFP16 = Subtarget->hasFullFP16(); + + if (SrcVT == MVT::f32 && DstVT == MVT::i32) { + // FullFP16: half values are passed in S-registers, and we don't + // need any of the bitcast and moves: + // + // t2: f32,ch = CopyFromReg t0, Register:f32 %0 + // t5: i32 = bitcast t2 + // t18: f16 = ARMISD::VMOVhr t5 + if (Op.getOpcode() != ISD::CopyFromReg || + Op.getValueType() != MVT::f32) + return SDValue(); + + auto Move = N->use_begin(); + if (Move->getOpcode() != ARMISD::VMOVhr) + return SDValue(); + + SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; + SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); + DAG.ReplaceAllUsesWith(*Move, &Copy); + return Copy; + } + + if (SrcVT == MVT::i16 && DstVT == MVT::f16) { + if (!HasFullFP16) + return SDValue(); + // SoftFP: read half-precision arguments: + // + // t2: i32,ch = ... 
+ // t7: i16 = truncate t2 <~~~~ Op + // t8: f16 = bitcast t7 <~~~~ N + // + if (Op.getOperand(0).getValueType() == MVT::i32) + return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), + MVT::f16, Op.getOperand(0)); - if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) && - (DstVT == MVT::f16 || DstVT == MVT::bf16)) - return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(), - DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op)); + return SDValue(); + } - if ((DstVT == MVT::i16 || DstVT == MVT::i32) && - (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) - return DAG.getNode( - ISD::TRUNCATE, SDLoc(N), DstVT, - MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op)); + // Half-precision return values + if (SrcVT == MVT::f16 && DstVT == MVT::i16) { + if (!HasFullFP16) + return SDValue(); + // + // t11: f16 = fadd t8, t10 + // t12: i16 = bitcast t11 <~~~ SDNode N + // t13: i32 = zero_extend t12 + // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 + // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 + // + // transform this into: + // + // t20: i32 = ARMISD::VMOVrh t11 + // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 + // + auto ZeroExtend = N->use_begin(); + if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || + ZeroExtend->getValueType(0) != MVT::i32) + return SDValue(); + + auto Copy = ZeroExtend->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg && + Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { + SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); + DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); + return Cvt; + } + return SDValue(); + } if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) return SDValue(); @@ -6372,69 +5923,23 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } -SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op, - SelectionDAG &DAG) const { +SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { // The rounding mode is in bits 23:22 of the FPSCR. // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); - SDValue Chain = Op.getOperand(0); - SDValue Ops[] = {Chain, - DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)}; + SDValue Ops[] = { DAG.getEntryNode(), + DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; - SDValue FPSCR = - DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops); - Chain = FPSCR.getValue(1); + SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); - SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, - DAG.getConstant(3, dl, MVT::i32)); - return DAG.getMergeValues({And, Chain}, dl); -} - -SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - SDValue Chain = Op->getOperand(0); - SDValue RMValue = Op->getOperand(1); - - // The rounding mode is in bits 23:22 of the FPSCR. - // The llvm.set.rounding argument value to ARM rounding mode value mapping - // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is - // ((arg - 1) & 3) << 22). 
- // - // It is expected that the argument of llvm.set.rounding is within the - // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is - // responsibility of the code generated llvm.set.rounding to ensure this - // condition. - - // Calculate new value of FPSCR[23:22]. - RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue, - DAG.getConstant(1, DL, MVT::i32)); - RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue, - DAG.getConstant(0x3, DL, MVT::i32)); - RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue, - DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32)); - - // Get current value of FPSCR. - SDValue Ops[] = {Chain, - DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)}; - SDValue FPSCR = - DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops); - Chain = FPSCR.getValue(1); - FPSCR = FPSCR.getValue(0); - - // Put new rounding mode into FPSCR[23:22]. - const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos); - FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR, - DAG.getConstant(RMMask, DL, MVT::i32)); - FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue); - SDValue Ops2[] = { - Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR}; - return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); + return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + DAG.getConstant(3, dl, MVT::i32)); } static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, @@ -6766,23 +6271,23 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, if (ST->hasMVEFloatOps()) { Opc = ARMCC::NE; break; } else { - Invert = true; [[fallthrough]]; + Invert = true; LLVM_FALLTHROUGH; } case ISD::SETOEQ: case ISD::SETEQ: Opc = ARMCC::EQ; break; case ISD::SETOLT: - case ISD::SETLT: Swap = true; [[fallthrough]]; + case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: Opc = ARMCC::GT; break; case ISD::SETOLE: - case ISD::SETLE: Swap = true; [[fallthrough]]; + case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGE: case ISD::SETGE: Opc = ARMCC::GE; break; - case ISD::SETUGE: Swap = true; [[fallthrough]]; + case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; - case ISD::SETUGT: Swap = true; [[fallthrough]]; + case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; - case ISD::SETUEQ: Invert = true; [[fallthrough]]; + case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; case ISD::SETONE: { // Expand this to (OLT | OGT). SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, @@ -6794,7 +6299,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, Result = DAG.getNOT(dl, Result, VT); return Result; } - case ISD::SETUO: Invert = true; [[fallthrough]]; + case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; case ISD::SETO: { // Expand this to (OLT | OGE). 
SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, @@ -6815,16 +6320,16 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, if (ST->hasMVEIntegerOps()) { Opc = ARMCC::NE; break; } else { - Invert = true; [[fallthrough]]; + Invert = true; LLVM_FALLTHROUGH; } case ISD::SETEQ: Opc = ARMCC::EQ; break; - case ISD::SETLT: Swap = true; [[fallthrough]]; + case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGT: Opc = ARMCC::GT; break; - case ISD::SETLE: Swap = true; [[fallthrough]]; + case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGE: Opc = ARMCC::GE; break; - case ISD::SETULT: Swap = true; [[fallthrough]]; + case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: Opc = ARMCC::HI; break; - case ISD::SETULE: Swap = true; [[fallthrough]]; + case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGE: Opc = ARMCC::HS; break; } @@ -6856,25 +6361,25 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, // If one of the operands is a constant vector zero, attempt to fold the // comparison to a specialized compare-against-zero form. - if (ISD::isBuildVectorAllZeros(Op0.getNode()) && - (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ || - Opc == ARMCC::NE)) { + SDValue SingleOp; + if (ISD::isBuildVectorAllZeros(Op1.getNode())) + SingleOp = Op0; + else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { if (Opc == ARMCC::GE) Opc = ARMCC::LE; else if (Opc == ARMCC::GT) Opc = ARMCC::LT; - std::swap(Op0, Op1); + SingleOp = Op1; } SDValue Result; - if (ISD::isBuildVectorAllZeros(Op1.getNode()) && - (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE || - Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ)) - Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0, + if (SingleOp.getNode()) { + Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, DAG.getConstant(Opc, dl, MVT::i32)); - else + } else { Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, DAG.getConstant(Opc, dl, MVT::i32)); + } Result = DAG.getSExtOrTrunc(Result, dl, VT); @@ -6919,10 +6424,9 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { /// immediate" operand (e.g., VMOV). If so, return the encoded value. static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, - const SDLoc &dl, EVT &VT, EVT VectorVT, + const SDLoc &dl, EVT &VT, bool is128Bits, VMOVModImmType type) { unsigned OpCmode, Imm; - bool is128Bits = VectorVT.is128BitVector(); // SplatBitSize is set to the smallest size that splats the vector, so a // zero vector will always have SplatBitSize == 8. However, NEON modified @@ -7026,10 +6530,12 @@ static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, return SDValue(); // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. uint64_t BitMask = 0xff; + uint64_t Val = 0; unsigned ImmMask = 1; Imm = 0; for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { if (((SplatBits | SplatUndef) & BitMask) == BitMask) { + Val |= BitMask; Imm |= ImmMask; } else if ((SplatBits & BitMask) != 0) { return SDValue(); @@ -7038,18 +6544,9 @@ static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, ImmMask <<= 1; } - if (DAG.getDataLayout().isBigEndian()) { - // Reverse the order of elements within the vector. 
- unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8; - unsigned Mask = (1 << BytesPerElem) - 1; - unsigned NumElems = 8 / BytesPerElem; - unsigned NewImm = 0; - for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) { - unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask); - NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem; - } - Imm = NewImm; - } + if (DAG.getDataLayout().isBigEndian()) + // swap higher and lower 32 bit word + Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); // Op=1, Cmode=1110. OpCmode = 0x1e; @@ -7088,6 +6585,8 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, case MVT::f64: { SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); + if (!ST->isLittle()) + std::swap(Lo, Hi); return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); } case MVT::f32: @@ -7140,7 +6639,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), - VMovVT, VT, VMOVModImm); + VMovVT, false, VMOVModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, @@ -7157,7 +6656,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, // Finally, try a VMVN.i32 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, - VT, VMVNModImm); + false, VMVNModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); @@ -7241,6 +6740,35 @@ static bool isVEXTMask(ArrayRef<int> M, EVT VT, return true; } +/// isVREVMask - Check if a vector shuffle corresponds to a VREV +/// instruction with the specified blocksize. (The order of the elements +/// within each block of the vector is reversed.) +static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { + assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && + "Only possible block sizes for VREV are: 16, 32, 64"); + + unsigned EltSz = VT.getScalarSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BlockElts = M[0] + 1; + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSz; + + if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + if (M[i] < 0) continue; // ignore UNDEF indices + if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) + return false; + } + + return true; +} + static bool isVTBLMask(ArrayRef<int> M, EVT VT) { // We can handle <8 x i8> vector shuffles. If the index in the mask is out of // range, then 0 is placed into the resulting vector. So pretty much any mask @@ -7513,33 +7041,11 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) { return true; } -static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) { +static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) { unsigned NumElts = VT.getVectorNumElements(); // Make sure the mask has the right size. if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) - return false; - - // Half-width truncation patterns (e.g. 
v4i32 -> v8i16): - // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6> - // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14> - // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7> - // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15> - int Ofs = Top ? 1 : 0; - int Upper = SingleSource ? 0 : NumElts; - for (int i = 0, e = NumElts / 2; i != e; ++i) { - if (M[i] >= 0 && M[i] != (i * 2) + Ofs) - return false; - if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper) return false; - } - return true; -} - -static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) { - unsigned NumElts = VT.getVectorNumElements(); - // Make sure the mask has the right size. - if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) - return false; // If Top // Look for <0, N, 2, N+2, 4, N+4, ..>. @@ -7548,137 +7054,16 @@ static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) { // Look for <0, N+1, 2, N+3, 4, N+5, ..> // This inserts Input1 into Input2 unsigned Offset = Top ? 0 : 1; - unsigned N = SingleSource ? 0 : NumElts; - for (unsigned i = 0; i < NumElts; i += 2) { + for (unsigned i = 0; i < NumElts; i+=2) { if (M[i] >= 0 && M[i] != (int)i) return false; - if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset)) + if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset)) return false; } return true; } -static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) { - unsigned NumElts = ToVT.getVectorNumElements(); - if (NumElts != M.size()) - return false; - - // Test if the Trunc can be convertable to a VMOVN with this shuffle. We are - // looking for patterns of: - // !rev: 0 N/2 1 N/2+1 2 N/2+2 ... - // rev: N/2 0 N/2+1 1 N/2+2 2 ... - - unsigned Off0 = rev ? NumElts / 2 : 0; - unsigned Off1 = rev ? 0 : NumElts / 2; - for (unsigned i = 0; i < NumElts; i += 2) { - if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2)) - return false; - if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2)) - return false; - } - - return true; -} - -// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted -// from a pair of inputs. For example: -// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), -// FP_ROUND(EXTRACT_ELT(Y, 0), -// FP_ROUND(EXTRACT_ELT(X, 1), -// FP_ROUND(EXTRACT_ELT(Y, 1), ...) -static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, - const ARMSubtarget *ST) { - assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); - if (!ST->hasMVEFloatOps()) - return SDValue(); - - SDLoc dl(BV); - EVT VT = BV.getValueType(); - if (VT != MVT::v8f16) - return SDValue(); - - // We are looking for a buildvector of fptrunc elements, where all the - // elements are interleavingly extracted from two sources. Check the first two - // items are valid enough and extract some info from them (they are checked - // properly in the loop below). 
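The isVREVMask helper added above accepts a shuffle mask when the lanes inside every BlockSize-bit block appear in mirrored order. A minimal standalone sketch of that test in plain C++ (the helper name is invented, the block width is taken from the sizes rather than recovered from M[0], and -1 marks an undefined lane):

#include <cstdio>
#include <vector>

static bool looksLikeVREVMask(const std::vector<int> &M, unsigned EltBits,
                              unsigned BlockBits) {
  if (BlockBits <= EltBits || BlockBits % EltBits != 0)
    return false;
  const unsigned BlockElts = BlockBits / EltBits;
  for (unsigned i = 0; i < M.size(); ++i) {
    if (M[i] < 0)
      continue; // undefined lane: be optimistic, as the real check is
    unsigned Expected = (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts);
    if (static_cast<unsigned>(M[i]) != Expected)
      return false;
  }
  return true;
}

int main() {
  // v8i16 with a 32-bit block size: lanes swap in pairs, i.e. VREV32.16.
  std::vector<int> Mask = {1, 0, 3, 2, 5, 4, 7, 6};
  std::printf("vrev32: %d\n", looksLikeVREVMask(Mask, 16, 32));
}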
- if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND || - BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || - BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0) - return SDValue(); - if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND || - BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || - BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0) - return SDValue(); - SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); - SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0); - if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32) - return SDValue(); - - // Check all the values in the BuildVector line up with our expectations. - for (unsigned i = 1; i < 4; i++) { - auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { - return Trunc.getOpcode() == ISD::FP_ROUND && - Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - Trunc.getOperand(0).getOperand(0) == Op && - Trunc.getOperand(0).getConstantOperandVal(1) == Idx; - }; - if (!Check(BV.getOperand(i * 2 + 0), Op0, i)) - return SDValue(); - if (!Check(BV.getOperand(i * 2 + 1), Op1, i)) - return SDValue(); - } - - SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0, - DAG.getConstant(0, dl, MVT::i32)); - return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1, - DAG.getConstant(1, dl, MVT::i32)); -} - -// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted -// from a single input on alternating lanes. For example: -// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), -// FP_ROUND(EXTRACT_ELT(X, 2), -// FP_ROUND(EXTRACT_ELT(X, 4), ...) -static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, - const ARMSubtarget *ST) { - assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); - if (!ST->hasMVEFloatOps()) - return SDValue(); - - SDLoc dl(BV); - EVT VT = BV.getValueType(); - if (VT != MVT::v4f32) - return SDValue(); - - // We are looking for a buildvector of fptext elements, where all the - // elements are alternating lanes from a single source. For example <0,2,4,6> - // or <1,3,5,7>. Check the first two items are valid enough and extract some - // info from them (they are checked properly in the loop below). - if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND || - BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); - SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); - int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1); - if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1)) - return SDValue(); - - // Check all the values in the BuildVector line up with our expectations. - for (unsigned i = 1; i < 4; i++) { - auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { - return Trunc.getOpcode() == ISD::FP_EXTEND && - Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - Trunc.getOperand(0).getOperand(0) == Op && - Trunc.getOperand(0).getConstantOperandVal(1) == Idx; - }; - if (!Check(BV.getOperand(i), Op0, 2 * i + Offset)) - return SDValue(); - } - - return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0, - DAG.getConstant(Offset, dl, MVT::i32)); -} - // If N is an integer constant that can be moved into a register in one // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. 
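LowerBuildVectorOfFPTrunc above only fires when the build_vector's lanes alternate strictly between lane i of one v4f32 source and lane i of the other. Modelling lane provenance as (source, index) pairs instead of SDValues, the shape being matched is roughly the following (function and variable names are illustrative only):

#include <cstdio>
#include <utility>
#include <vector>

static bool isInterleaveOfTwoSources(
    const std::vector<std::pair<char, unsigned>> &Lanes, char X, char Y) {
  if (Lanes.size() % 2 != 0)
    return false;
  for (unsigned i = 0; i < Lanes.size() / 2; ++i) {
    if (Lanes[2 * i] != std::make_pair(X, i))     // even lane: X[i]
      return false;
    if (Lanes[2 * i + 1] != std::make_pair(Y, i)) // odd lane: Y[i]
      return false;
  }
  return true;
}

int main() {
  std::vector<std::pair<char, unsigned>> Lanes = {
      {'X', 0}, {'Y', 0}, {'X', 1}, {'Y', 1},
      {'X', 2}, {'Y', 2}, {'X', 3}, {'Y', 3}};
  std::printf("vcvtn candidate: %d\n", isInterleaveOfTwoSources(Lanes, 'X', 'Y'));
}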
@@ -7709,10 +7094,7 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, unsigned NumElts = VT.getVectorNumElements(); unsigned BoolMask; unsigned BitsPerBool; - if (NumElts == 2) { - BitsPerBool = 8; - BoolMask = 0xff; - } else if (NumElts == 4) { + if (NumElts == 4) { BitsPerBool = 4; BoolMask = 0xf; } else if (NumElts == 8) { @@ -7728,9 +7110,10 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, // extend that single value SDValue FirstOp = Op.getOperand(0); if (!isa<ConstantSDNode>(FirstOp) && - llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) { - return U.get().isUndef() || U.get() == FirstOp; - })) { + std::all_of(std::next(Op->op_begin()), Op->op_end(), + [&FirstOp](SDUse &U) { + return U.get().isUndef() || U.get() == FirstOp; + })) { SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, DAG.getValueType(MVT::i1)); return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); @@ -7761,79 +7144,6 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, return Base; } -static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *ST) { - if (!ST->hasMVEIntegerOps()) - return SDValue(); - - // We are looking for a buildvector where each element is Op[0] + i*N - EVT VT = Op.getValueType(); - SDValue Op0 = Op.getOperand(0); - unsigned NumElts = VT.getVectorNumElements(); - - // Get the increment value from operand 1 - SDValue Op1 = Op.getOperand(1); - if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 || - !isa<ConstantSDNode>(Op1.getOperand(1))) - return SDValue(); - unsigned N = Op1.getConstantOperandVal(1); - if (N != 1 && N != 2 && N != 4 && N != 8) - return SDValue(); - - // Check that each other operand matches - for (unsigned I = 2; I < NumElts; I++) { - SDValue OpI = Op.getOperand(I); - if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 || - !isa<ConstantSDNode>(OpI.getOperand(1)) || - OpI.getConstantOperandVal(1) != I * N) - return SDValue(); - } - - SDLoc DL(Op); - return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0, - DAG.getConstant(N, DL, MVT::i32)); -} - -// Returns true if the operation N can be treated as qr instruction variant at -// operand Op. -static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) { - switch (N->getOpcode()) { - case ISD::ADD: - case ISD::MUL: - case ISD::SADDSAT: - case ISD::UADDSAT: - return true; - case ISD::SUB: - case ISD::SSUBSAT: - case ISD::USUBSAT: - return N->getOperand(1).getNode() == Op; - case ISD::INTRINSIC_WO_CHAIN: - switch (N->getConstantOperandVal(0)) { - case Intrinsic::arm_mve_add_predicated: - case Intrinsic::arm_mve_mul_predicated: - case Intrinsic::arm_mve_qadd_predicated: - case Intrinsic::arm_mve_vhadd: - case Intrinsic::arm_mve_hadd_predicated: - case Intrinsic::arm_mve_vqdmulh: - case Intrinsic::arm_mve_qdmulh_predicated: - case Intrinsic::arm_mve_vqrdmulh: - case Intrinsic::arm_mve_qrdmulh_predicated: - case Intrinsic::arm_mve_vqdmull: - case Intrinsic::arm_mve_vqdmull_predicated: - return true; - case Intrinsic::arm_mve_sub_predicated: - case Intrinsic::arm_mve_qsub_predicated: - case Intrinsic::arm_mve_vhsub: - case Intrinsic::arm_mve_hsub_predicated: - return N->getOperand(2).getNode() == Op; - default: - return false; - } - default: - return false; - } -} - // If this is a case we can't handle, return null and let the default // expansion code take care of it. 
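The removed LowerBUILD_VECTORToVIDUP looked for a build_vector whose lanes form Op0, Op0+N, Op0+2N, ... with N restricted to 1, 2, 4 or 8, which is exactly the sequence an MVE VIDUP produces. On plain integers the shape test is essentially (sketch only, names invented):

#include <cstdio>
#include <vector>

static bool isIncrementingSequence(const std::vector<int> &Elts, int &Step) {
  if (Elts.size() < 2)
    return false;
  Step = Elts[1] - Elts[0];
  if (Step != 1 && Step != 2 && Step != 4 && Step != 8)
    return false;
  for (size_t i = 2; i < Elts.size(); ++i)
    if (Elts[i] - Elts[0] != static_cast<int>(i) * Step)
      return false;
  return true;
}

int main() {
  std::vector<int> V = {10, 12, 14, 16}; // base 10, step 2: a VIDUP candidate
  int Step = 0;
  std::printf("vidup: %d, step %d\n", isIncrementingSequence(V, Step), Step);
}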
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, @@ -7845,37 +7155,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) return LowerBUILD_VECTOR_i1(Op, DAG, ST); - if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST)) - return R; - APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatUndef.isAllOnes()) + if (SplatUndef.isAllOnesValue()) return DAG.getUNDEF(VT); - // If all the users of this constant splat are qr instruction variants, - // generate a vdup of the constant. - if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize && - (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) && - all_of(BVN->uses(), - [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) { - EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 - : SplatBitSize == 16 ? MVT::v8i16 - : MVT::v16i8; - SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); - SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); - return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); - } - if ((ST->hasNEON() && SplatBitSize <= 64) || - (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) { + (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { // Check if an immediate VMOV works. EVT VmovVT; - SDValue Val = - isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), - SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm); + SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VmovVT, VT.is128BitVector(), + VMOVModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); @@ -7885,8 +7179,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Try an immediate VMVN. uint64_t NegatedImm = (~SplatBits).getZExtValue(); Val = isVMOVModifiedImm( - NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, - VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); + NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VmovVT, VT.is128BitVector(), + ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); @@ -7900,18 +7195,6 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); } } - - // If we are under MVE, generate a VDUP(constant), bitcast to the original - // type. - if (ST->hasMVEIntegerOps() && - (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) { - EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 - : SplatBitSize == 16 ? MVT::v8i16 - : MVT::v16i8; - SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); - SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); - return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); - } } } @@ -8038,19 +7321,12 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (isConstant) return SDValue(); - // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and - // vmovn). Empirical tests suggest this is rarely worth it for vectors of - // length <= 2. - if (NumElts >= 4) - if (SDValue shuffle = ReconstructShuffle(Op, DAG)) + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 
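LowerBUILD_VECTOR asks isVMOVModifiedImm whether a constant splat fits one of the VMOV modified-immediate encodings, then retries with the bitwise inverse for VMVN. One concrete case handled earlier in this file is the 64-bit form in which every byte of the splat must be 0x00 or 0xff and each all-ones byte sets one bit of an 8-bit immediate. A standalone sketch of just that case (an assumed simplification, not the full encoder):

#include <cstdint>
#include <cstdio>
#include <optional>

static std::optional<uint8_t> encodeByteMaskImm(uint64_t SplatBits,
                                                uint64_t SplatUndef) {
  uint8_t Imm = 0;
  for (int Byte = 0; Byte < 8; ++Byte) {
    const uint64_t Mask = 0xffULL << (8 * Byte);
    if (((SplatBits | SplatUndef) & Mask) == Mask)
      Imm |= 1u << Byte;   // byte is all ones (or undefined)
    else if ((SplatBits & Mask) != 0)
      return std::nullopt; // partially set byte: not encodable
  }
  return Imm;
}

int main() {
  if (auto Imm = encodeByteMaskImm(0x00ff00ff00ff00ffULL, 0))
    std::printf("imm = 0x%02x\n", static_cast<unsigned>(*Imm)); // prints 0x55
}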
+ if (NumElts >= 4) { + SDValue shuffle = ReconstructShuffle(Op, DAG); + if (shuffle != SDValue()) return shuffle; - - // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into - // VCVT's - if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget)) - return VCVT; - if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget)) - return VCVT; + } if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { // If we haven't found an efficient lowering, try splitting a 128-bit vector @@ -8058,11 +7334,12 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); EVT ExtVT = VT.getVectorElementType(); EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); - SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2)); + SDValue Lower = + DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); if (Lower.getOpcode() == ISD::BUILD_VECTOR) Lower = LowerBUILD_VECTOR(Lower, DAG, ST); - SDValue Upper = - DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2)); + SDValue Upper = DAG.getBuildVector( + HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); if (Upper.getOpcode() == ISD::BUILD_VECTOR) Upper = LowerBUILD_VECTOR(Upper, DAG, ST); if (Lower && Upper) @@ -8187,19 +7464,17 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); - uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); - uint64_t VTSize = VT.getFixedSizeInBits(); - if (SrcVTSize == VTSize) + if (SrcVT.getSizeInBits() == VT.getSizeInBits()) continue; // This stage of the search produces a source with the same element type as // the original, but with a total width matching the BUILD_VECTOR output. EVT EltVT = SrcVT.getVectorElementType(); - unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); + unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); - if (SrcVTSize < VTSize) { - if (2 * SrcVTSize != VTSize) + if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { + if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) return SDValue(); // We can pad out the smaller vector for free, so if it's part of a // shuffle... @@ -8209,7 +7484,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, continue; } - if (SrcVTSize != 2 * VTSize) + if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) return SDValue(); if (Src.MaxElt - Src.MinElt >= NumSrcElts) { @@ -8252,12 +7527,12 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); - Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec); + Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); Src.WindowBase *= Src.WindowScale; } - // Final check before we try to actually produce a shuffle. + // Final sanity check before we try to actually produce a shuffle. LLVM_DEBUG(for (auto Src : Sources) assert(Src.ShuffleVec.getValueType() == ShuffleVT);); @@ -8277,7 +7552,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. 
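The first ReconstructShuffle pass above normalizes every source to the width of the result: an equal-width source is kept, a source exactly half as wide can be padded out for free, and a source exactly twice as wide is only a candidate for narrowing (and still subject to the MinElt/MaxElt span check); anything else aborts the rewrite. A width-only classification of that decision, with invented names:

#include <cstdio>

enum class SourceFix { Keep, PadToFull, NarrowFromDouble, GiveUp };

// Width classification only; the real routine additionally checks how far
// apart the referenced lanes are before narrowing a double-width source.
static SourceFix classifySourceWidth(unsigned SrcBits, unsigned DstBits) {
  if (SrcBits == DstBits)
    return SourceFix::Keep;
  if (SrcBits < DstBits)
    return 2 * SrcBits == DstBits ? SourceFix::PadToFull : SourceFix::GiveUp;
  return SrcBits == 2 * DstBits ? SourceFix::NarrowFromDouble
                                : SourceFix::GiveUp;
}

int main() {
  std::printf("%d %d %d\n",
              static_cast<int>(classifySourceWidth(64, 128)),   // pad
              static_cast<int>(classifySourceWidth(128, 128)),  // keep
              static_cast<int>(classifySourceWidth(256, 128))); // narrow
}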
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); - int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), + int BitsDefined = std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; @@ -8304,7 +7579,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, ShuffleOps[1], Mask, DAG); if (!Shuffle) return SDValue(); - return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle); + return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } enum ShuffleOpCodes { @@ -8380,17 +7655,11 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { isVTBLMask(M, VT) || isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) return true; - else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && + else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)) return true; else if (Subtarget->hasMVEIntegerOps() && - (isVMOVNMask(M, VT, true, false) || - isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true))) - return true; - else if (Subtarget->hasMVEIntegerOps() && - (isTruncMask(M, VT, false, false) || - isTruncMask(M, VT, false, true) || - isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true))) + (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1))) return true; else return false; @@ -8420,13 +7689,14 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, default: llvm_unreachable("Unknown shuffle opcode!"); case OP_VREV: // VREV divides the vector in half and swaps within the half. - if (VT.getScalarSizeInBits() == 32) + if (VT.getVectorElementType() == MVT::i32 || + VT.getVectorElementType() == MVT::f32) return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); // vrev <4 x i16> -> VREV32 - if (VT.getScalarSizeInBits() == 16) + if (VT.getVectorElementType() == MVT::i16) return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); // vrev <4 x i8> -> VREV16 - assert(VT.getScalarSizeInBits() == 8); + assert(VT.getVectorElementType() == MVT::i8); return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); case OP_VDUP0: case OP_VDUP1: @@ -8464,8 +7734,9 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, SDLoc DL(Op); SmallVector<SDValue, 8> VTBLMask; - for (int I : ShuffleMask) - VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32)); + for (ArrayRef<int>::iterator + I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) + VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); if (V2.getNode()->isUndef()) return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, @@ -8475,29 +7746,25 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); } -static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, + SelectionDAG &DAG) { SDLoc DL(Op); - EVT VT = Op.getValueType(); + SDValue OpLHS = Op.getOperand(0); + EVT VT = OpLHS.getValueType(); - assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && + assert((VT == MVT::v8i16 || VT == MVT::v16i8) && "Expect an v8i16/v16i8 type"); - SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0)); - // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now, + OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); + // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. 
Now, // extract the first 8 bytes into the top double word and the last 8 bytes - // into the bottom double word, through a new vector shuffle that will be - // turned into a VEXT on Neon, or a couple of VMOVDs on MVE. - std::vector<int> NewMask; - for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) - NewMask.push_back(VT.getVectorNumElements() / 2 + i); - for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) - NewMask.push_back(i); - return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask); + // into the bottom double word. The v8i16 case is similar. + unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; + return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, + DAG.getConstant(ExtractNum, DL, MVT::i32)); } static EVT getVectorTyFromPredicateVector(EVT VT) { switch (VT.getSimpleVT().SimpleTy) { - case MVT::v2i1: - return MVT::v2f64; case MVT::v4i1: return MVT::v4i32; case MVT::v8i1: @@ -8554,7 +7821,6 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, "No support for vector shuffle of boolean predicates"); SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); SDLoc dl(Op); if (isReverseMask(ShuffleMask, VT)) { SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); @@ -8572,26 +7838,15 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, // many cases the generated code might be even better than scalar code // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit // fields in a register into 8 other arbitrary 2-bit fields! - SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG); - EVT NewVT = PredAsVector1.getValueType(); - SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT) - : PromoteMVEPredVector(dl, V2, VT, DAG); - assert(PredAsVector2.getValueType() == NewVT && - "Expected identical vector type in expanded i1 shuffle!"); + SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); + EVT NewVT = PredAsVector.getValueType(); // Do the shuffle! - SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1, - PredAsVector2, ShuffleMask); + SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, + DAG.getUNDEF(NewVT), ShuffleMask); // Now return the result of comparing the shuffled vector with zero, - // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 - // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s. - if (VT == MVT::v2i1) { - SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled); - SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, - DAG.getConstant(ARMCC::NE, dl, MVT::i32)); - return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); - } + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } @@ -8649,8 +7904,8 @@ static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, Input = Op->getOperand(1); Elt -= 4; } - SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input); - Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast, + SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input); + Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast, DAG.getConstant(Elt, dl, MVT::i32)); } } @@ -8669,70 +7924,19 @@ static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, Parts[Part] ? 
-1 : ShuffleMask[Part * QuarterSize + i]); SDValue NewShuffle = DAG.getVectorShuffle( VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask); - SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle); + SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle); for (int Part = 0; Part < 4; ++Part) if (!Parts[Part]) - Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, + Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast, DAG.getConstant(Part, dl, MVT::i32)); } // Build a vector out of the various parts and bitcast it back to the original // type. - SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts); + SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts); return DAG.getBitcast(VT, NewVec); } -static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, - ArrayRef<int> ShuffleMask, - SelectionDAG &DAG) { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - EVT VT = Op.getValueType(); - unsigned NumElts = VT.getVectorNumElements(); - - // An One-Off Identity mask is one that is mostly an identity mask from as - // single source but contains a single element out-of-place, either from a - // different vector or from another position in the same vector. As opposed to - // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert - // pair directly. - auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset, - int &OffElement) { - OffElement = -1; - int NonUndef = 0; - for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) { - if (Mask[i] == -1) - continue; - NonUndef++; - if (Mask[i] != i + BaseOffset) { - if (OffElement == -1) - OffElement = i; - else - return false; - } - } - return NonUndef > 2 && OffElement != -1; - }; - int OffElement; - SDValue VInput; - if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement)) - VInput = V1; - else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement)) - VInput = V2; - else - return SDValue(); - - SDLoc dl(Op); - EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16 - ? MVT::i32 - : VT.getScalarType(); - SDValue Elt = DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, dl, SVT, - ShuffleMask[OffElement] < (int)NumElts ? 
V1 : V2, - DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl)); - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt, - DAG.getVectorIdxConstant(OffElement % NumElts, dl)); -} - static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); @@ -8819,15 +8023,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, } } if (ST->hasMVEIntegerOps()) { - if (isVMOVNMask(ShuffleMask, VT, false, false)) + if (isVMOVNMask(ShuffleMask, VT, 0)) return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, DAG.getConstant(0, dl, MVT::i32)); - if (isVMOVNMask(ShuffleMask, VT, true, false)) + if (isVMOVNMask(ShuffleMask, VT, 1)) return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, DAG.getConstant(1, dl, MVT::i32)); - if (isVMOVNMask(ShuffleMask, VT, true, true)) - return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1, - DAG.getConstant(1, dl, MVT::i32)); } // Also check for these shuffles through CONCAT_VECTORS: we canonicalize @@ -8869,29 +8070,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, } } - if (ST->hasMVEIntegerOps() && EltSize <= 32) { - if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG)) - return V; - - for (bool Top : {false, true}) { - for (bool SingleSource : {false, true}) { - if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) { - MVT FromSVT = MVT::getIntegerVT(EltSize * 2); - MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2); - SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1); - SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, - SingleSource ? V1 : V2); - if (Top) { - SDValue Amt = DAG.getConstant(EltSize, dl, FromVT); - Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt); - Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt); - } - return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi); - } - } - } - } - // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. 
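Looking back at LowerVECTOR_SHUFFLEUsingOneOff earlier in this hunk: a "one-off identity" mask is an identity mask (optionally offset by the second operand's base) with exactly one lane out of place and more than two lanes defined, so the whole shuffle collapses into a single extract/insert pair. The same test on a plain mask, with -1 as the undefined marker and an invented function name:

#include <cstdio>
#include <vector>

static int findOneOffLane(const std::vector<int> &Mask, int Base) {
  int OffLane = -1, Defined = 0;
  for (int i = 0, e = static_cast<int>(Mask.size()); i < e; ++i) {
    if (Mask[i] < 0)
      continue;      // undefined lane
    ++Defined;
    if (Mask[i] != i + Base) {
      if (OffLane != -1)
        return -1;   // more than one lane out of place
      OffLane = i;
    }
  }
  return (Defined > 2 && OffLane != -1) ? OffLane : -1;
}

int main() {
  std::vector<int> Mask = {0, 1, 6, 3}; // lane 2 is the odd one out
  std::printf("off lane = %d\n", findOneOffLane(Mask, 0));
}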
unsigned NumElts = VT.getVectorNumElements(); @@ -8946,9 +8124,8 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::BITCAST, dl, VT, Val); } - if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && - isReverseMask(ShuffleMask, VT)) - return LowerReverse_VECTOR_SHUFFLE(Op, DAG); + if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) + return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); if (ST->hasNEON() && VT == MVT::v8i8) if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) @@ -9065,75 +8242,54 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); SDLoc dl(Op); - assert(Op.getValueType().getScalarSizeInBits() == 1 && - "Unexpected custom CONCAT_VECTORS lowering"); - assert(isPowerOf2_32(Op.getNumOperands()) && + EVT VT = Op.getValueType(); + EVT Op1VT = V1.getValueType(); + EVT Op2VT = V2.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + assert(Op1VT == Op2VT && "Operand types don't match!"); + assert(VT.getScalarSizeInBits() == 1 && "Unexpected custom CONCAT_VECTORS lowering"); assert(ST->hasMVEIntegerOps() && "CONCAT_VECTORS lowering only supported for MVE"); - auto ConcatPair = [&](SDValue V1, SDValue V2) { - EVT Op1VT = V1.getValueType(); - EVT Op2VT = V2.getValueType(); - assert(Op1VT == Op2VT && "Operand types don't match!"); - EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext()); - - SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); - SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); - - // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets - // promoted to v8i16, etc. - MVT ElType = - getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); - unsigned NumElts = 2 * Op1VT.getVectorNumElements(); - - // Extract the vector elements from Op1 and Op2 one by one and truncate them - // to be the right size for the destination. For example, if Op1 is v4i1 - // then the promoted vector is v4i32. The result of concatenation gives a - // v8i1, which when promoted is v8i16. That means each i32 element from Op1 - // needs truncating to i16 and inserting in the result. - EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); - SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); - auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { - EVT NewVT = NewV.getValueType(); - EVT ConcatVT = ConVec.getValueType(); - for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, - DAG.getIntPtrConstant(i, dl)); - ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, - DAG.getConstant(j, dl, MVT::i32)); - } - return ConVec; - }; - unsigned j = 0; - ConVec = ExtractInto(NewV1, ConVec, j); - ConVec = ExtractInto(NewV2, ConVec, j); - - // Now return the result of comparing the subvector with zero, which will - // generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 we - // convert to a v4i1 compare to fill in the two halves of the i64 as i32s. 
- if (VT == MVT::v2i1) { - SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, ConVec); - SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, - DAG.getConstant(ARMCC::NE, dl, MVT::i32)); - return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); + + // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets + // promoted to v8i16, etc. + + MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + + // Extract the vector elements from Op1 and Op2 one by one and truncate them + // to be the right size for the destination. For example, if Op1 is v4i1 then + // the promoted vector is v4i32. The result of concatentation gives a v8i1, + // which when promoted is v8i16. That means each i32 element from Op1 needs + // truncating to i16 and inserting in the result. + EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); + SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); + auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { + EVT NewVT = NewV.getValueType(); + EVT ConcatVT = ConVec.getValueType(); + for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, + DAG.getIntPtrConstant(i, dl)); + ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); } - return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, - DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + return ConVec; }; + unsigned j = 0; + ConVec = ExractInto(NewV1, ConVec, j); + ConVec = ExractInto(NewV2, ConVec, j); - // Concat each pair of subvectors and pack into the lower half of the array. - SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end()); - while (ConcatOps.size() > 1) { - for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { - SDValue V1 = ConcatOps[I]; - SDValue V2 = ConcatOps[I + 1]; - ConcatOps[I / 2] = ConcatPair(V1, V2); - } - ConcatOps.resize(ConcatOps.size() / 2); - } - return ConcatOps[0]; + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 
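The removed LowerCONCAT_VECTORS_i1 handled any power-of-two operand count by folding adjacent pairs with ConcatPair until a single vector remained, whereas the restored version assumes exactly two operands. The loop shape, reduced to ordinary containers (string concatenation stands in for ConcatPair):

#include <cstdio>
#include <string>
#include <vector>

template <typename T, typename F>
static T reducePairwise(std::vector<T> Ops, F ConcatPair) {
  while (Ops.size() > 1) {
    for (size_t I = 0, E = Ops.size(); I != E; I += 2)
      Ops[I / 2] = ConcatPair(Ops[I], Ops[I + 1]);
    Ops.resize(Ops.size() / 2);
  }
  return Ops[0];
}

int main() {
  std::vector<std::string> Parts = {"v4i1#0", "v4i1#1", "v4i1#2", "v4i1#3"};
  std::string Out = reducePairwise(Parts, [](const std::string &A,
                                             const std::string &B) {
    return "(" + A + "++" + B + ")";
  });
  std::printf("%s\n", Out.c_str()); // ((v4i1#0++v4i1#1)++(v4i1#2++v4i1#3))
}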
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, @@ -9183,22 +8339,6 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); - if (NumElts == 2) { - EVT SubVT = MVT::v4i32; - SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); - for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) { - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, - DAG.getIntPtrConstant(i, dl)); - SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, - DAG.getConstant(j, dl, MVT::i32)); - SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, - DAG.getConstant(j + 1, dl, MVT::i32)); - } - SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec, - DAG.getConstant(ARMCC::NE, dl, MVT::i32)); - return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); - } - EVT SubVT = MVT::getVectorVT(ElType, NumElts); SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { @@ -9214,116 +8354,6 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } -// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0). -static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - assert(ST->hasMVEIntegerOps() && "Expected MVE!"); - EVT VT = N->getValueType(0); - assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) && - "Expected a vector i1 type!"); - SDValue Op = N->getOperand(0); - EVT FromVT = Op.getValueType(); - SDLoc DL(N); - - SDValue And = - DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT)); - return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT), - DAG.getCondCode(ISD::SETNE)); -} - -static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - EVT ToVT = N->getValueType(0); - if (ToVT.getScalarType() == MVT::i1) - return LowerTruncatei1(N, DAG, Subtarget); - - // MVE does not have a single instruction to perform the truncation of a v4i32 - // into the lower half of a v8i16, in the same way that a NEON vmovn would. - // Most of the instructions in MVE follow the 'Beats' system, where moving - // values from different lanes is usually something that the instructions - // avoid. - // - // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B, - // which take a the top/bottom half of a larger lane and extend it (or do the - // opposite, truncating into the top/bottom lane from a larger lane). Note - // that because of the way we widen lanes, a v4i16 is really a v4i32 using the - // bottom 16bits from each vector lane. This works really well with T/B - // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need - // to move order. - // - // But truncates and sext/zext are always going to be fairly common from llvm. - // We have several options for how to deal with them: - // - Wherever possible combine them into an instruction that makes them - // "free". This includes loads/stores, which can perform the trunc as part - // of the memory operation. Or certain shuffles that can be turned into - // VMOVN/VMOVL. - // - Lane Interleaving to transform blocks surrounded by ext/trunc. 
So - // trunc(mul(sext(a), sext(b))) may become - // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in - // this case can use VMULL). This is performed in the - // MVELaneInterleavingPass. - // - Otherwise we have an option. By default we would expand the - // zext/sext/trunc into a series of lane extract/inserts going via GPR - // registers. One for each vector lane in the vector. This can obviously be - // very expensive. - // - The other option is to use the fact that loads/store can extend/truncate - // to turn a trunc into two truncating stack stores and a stack reload. This - // becomes 3 back-to-back memory operations, but at least that is less than - // all the insert/extracts. - // - // In order to do the last, we convert certain trunc's into MVETRUNC, which - // are either optimized where they can be, or eventually lowered into stack - // stores/loads. This prevents us from splitting a v8i16 trunc into two stores - // two early, where other instructions would be better, and stops us from - // having to reconstruct multiple buildvector shuffles into loads/stores. - if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8) - return SDValue(); - EVT FromVT = N->getOperand(0).getValueType(); - if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16) - return SDValue(); - - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); - SDLoc DL(N); - return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi); -} - -static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC. - - EVT ToVT = N->getValueType(0); - if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16) - return SDValue(); - SDValue Op = N->getOperand(0); - EVT FromVT = Op.getValueType(); - if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8) - return SDValue(); - - SDLoc DL(N); - EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); - if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) - ExtVT = MVT::v8i16; - - unsigned Opcode = - N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT; - SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op); - SDValue Ext1 = Ext.getValue(1); - - if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) { - Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext); - Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1); - } - - return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1); -} - /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each /// element has been zero/sign-extended, depending on the isSigned parameter, /// from an integer type half its size. @@ -9349,7 +8379,7 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) return true; } else { - if (Hi0->isZero() && Hi1->isZero()) + if (Hi0->isNullValue() && Hi1->isNullValue()) return true; } return false; @@ -9388,11 +8418,10 @@ static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { return false; } -/// isZeroExtended - Check if a node is a vector value that is zero-extended (or -/// any-extended) or a constant BUILD_VECTOR with zero-extended elements. +/// isZeroExtended - Check if a node is a vector value that is zero-extended +/// or a constant BUILD_VECTOR with zero-extended elements. 
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND || - ISD::isZEXTLoad(N)) + if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) return true; if (isExtendedBUILD_VECTOR(N, DAG, false)) return true; @@ -9447,27 +8476,26 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { // The load already has the right type. if (ExtendedTy == LD->getMemoryVT()) return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), - LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(), - LD->getMemOperand()->getFlags()); + LD->getBasePtr(), LD->getPointerInfo(), + LD->getAlignment(), LD->getMemOperand()->getFlags()); // We need to create a zextload/sextload. We cannot just create a load // followed by a zext/zext node because LowerMUL is also run during normal // operation legalization where we can't create illegal types. return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), - LD->getMemoryVT(), LD->getAlign(), + LD->getMemoryVT(), LD->getAlignment(), LD->getMemOperand()->getFlags()); } /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, -/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return -/// the unextended value. The unextended vector should be 64 bits so that it can +/// extending load, or BUILD_VECTOR with extended elements, return the +/// unextended value. The unextended vector should be 64 bits so that it can /// be used as an operand to a VMULL instruction. If the original vector size /// before extension is less than 64 bits we add a an extension to resize /// the vector to 64 bits. static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::SIGN_EXTEND || - N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND) + if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), @@ -9864,7 +8892,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { if (ShouldUseSRet) { // Create stack object for sret. const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); + const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); @@ -9964,7 +8992,7 @@ ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, if (N->getOpcode() != ISD::SDIV) return SDValue(); - const auto &ST = DAG.getSubtarget<ARMSubtarget>(); + const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); const bool MinSize = ST.hasMinSize(); const bool HasDivide = ST.isThumb() ? 
ST.hasDivideInThumbMode() : ST.hasDivideInARMMode(); @@ -10039,136 +9067,69 @@ void ARMTargetLowering::ExpandDIV_Windows( DAG.getConstant(32, dl, TLI.getPointerTy(DL))); Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper)); + Results.push_back(Lower); + Results.push_back(Upper); } static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); EVT MemVT = LD->getMemoryVT(); - assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || - MemVT == MVT::v16i1) && + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && "Expected a predicate type!"); assert(MemVT == Op.getValueType()); assert(LD->getExtensionType() == ISD::NON_EXTLOAD && "Expected a non-extending load"); assert(LD->isUnindexed() && "Expected a unindexed load"); - // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit + // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We - // need to make sure that 8/4/2 bits are actually loaded into the correct + // need to make sure that 8/4 bits are actually loaded into the correct // place, which means loading the value and then shuffling the values into // the bottom bits of the predicate. // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect // for BE). - // Speaking of BE, apparently the rest of llvm will assume a reverse order to - // a natural VMSR(load), so needs to be reversed. SDLoc dl(Op); SDValue Load = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(), EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), LD->getMemOperand()); - SDValue Val = Load; - if (DAG.getDataLayout().isBigEndian()) - Val = DAG.getNode(ISD::SRL, dl, MVT::i32, - DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load), - DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32)); - SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val); + SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load); if (MemVT != MVT::v16i1) Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred, DAG.getConstant(0, dl, MVT::i32)); return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); } -void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) const { - LoadSDNode *LD = cast<LoadSDNode>(N); - EVT MemVT = LD->getMemoryVT(); - assert(LD->isUnindexed() && "Loads should be unindexed at this point."); - - if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && - !Subtarget->isThumb1Only() && LD->isVolatile()) { - SDLoc dl(N); - SDValue Result = DAG.getMemIntrinsicNode( - ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}), - {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand()); - SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1); - SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 
1 : 0); - SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); - Results.append({Pair, Result.getValue(2)}); - } -} - static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); EVT MemVT = ST->getMemoryVT(); - assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || - MemVT == MVT::v16i1) && + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && "Expected a predicate type!"); assert(MemVT == ST->getValue().getValueType()); assert(!ST->isTruncatingStore() && "Expected a non-extending store"); assert(ST->isUnindexed() && "Expected a unindexed store"); - // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with - // top bits unset and a scalar store. + // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits + // unset and a scalar store. SDLoc dl(Op); SDValue Build = ST->getValue(); if (MemVT != MVT::v16i1) { SmallVector<SDValue, 16> Ops; - for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) { - unsigned Elt = DAG.getDataLayout().isBigEndian() - ? MemVT.getVectorNumElements() - I - 1 - : I; + for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, - DAG.getConstant(Elt, dl, MVT::i32))); - } + DAG.getConstant(I, dl, MVT::i32))); for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) Ops.push_back(DAG.getUNDEF(MVT::i32)); Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); } SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); - if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian()) - GRP = DAG.getNode(ISD::SRL, dl, MVT::i32, - DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP), - DAG.getConstant(16, dl, MVT::i32)); return DAG.getTruncStore( ST->getChain(), dl, GRP, ST->getBasePtr(), EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), ST->getMemOperand()); } -static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); - EVT MemVT = ST->getMemoryVT(); - assert(ST->isUnindexed() && "Stores should be unindexed at this point."); - - if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && - !Subtarget->isThumb1Only() && ST->isVolatile()) { - SDNode *N = Op.getNode(); - SDLoc dl(N); - - SDValue Lo = DAG.getNode( - ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), - DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl, - MVT::i32)); - SDValue Hi = DAG.getNode( - ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), - DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 
1 : 0, dl, - MVT::i32)); - - return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other), - {ST->getChain(), Lo, Hi, ST->getBasePtr()}, - MemVT, ST->getMemOperand()); - } else if (Subtarget->hasMVEIntegerOps() && - ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || - MemVT == MVT::v16i1))) { - return LowerPredicateStore(Op, DAG); - } - - return SDValue(); -} - static bool isZeroVector(SDValue N) { return (ISD::isBuildVectorAllZeros(N.getNode()) || (N->getOpcode() == ARMISD::VMOVIMM && @@ -10194,89 +9155,15 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad()); SDValue Combo = NewLoad; - bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST || - PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) && - isZeroVector(PassThru->getOperand(0)); - if (!PassThru.isUndef() && !PassThruIsCastZero) + if (!PassThru.isUndef() && + (PassThru.getOpcode() != ISD::BITCAST || + !isZeroVector(PassThru->getOperand(0)))) Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); } -static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *ST) { - if (!ST->hasMVEIntegerOps()) - return SDValue(); - - SDLoc dl(Op); - unsigned BaseOpcode = 0; - switch (Op->getOpcode()) { - default: llvm_unreachable("Expected VECREDUCE opcode"); - case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break; - case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break; - case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break; - case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break; - case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break; - case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break; - case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; - case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; - } - - SDValue Op0 = Op->getOperand(0); - EVT VT = Op0.getValueType(); - EVT EltVT = VT.getVectorElementType(); - unsigned NumElts = VT.getVectorNumElements(); - unsigned NumActiveLanes = NumElts; - - assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 || - NumActiveLanes == 2) && - "Only expected a power 2 vector size"); - - // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements - // allows us to easily extract vector elements from the lanes. - while (NumActiveLanes > 4) { - unsigned RevOpcode = NumActiveLanes == 16 ? 
ARMISD::VREV16 : ARMISD::VREV32; - SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0); - Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev); - NumActiveLanes /= 2; - } - - SDValue Res; - if (NumActiveLanes == 4) { - // The remaining 4 elements are summed sequentially - SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(0 * NumElts / 4, dl, MVT::i32)); - SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(1 * NumElts / 4, dl, MVT::i32)); - SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(2 * NumElts / 4, dl, MVT::i32)); - SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(3 * NumElts / 4, dl, MVT::i32)); - SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); - SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags()); - Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags()); - } else { - SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(0, dl, MVT::i32)); - SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, - DAG.getConstant(1, dl, MVT::i32)); - Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); - } - - // Result type may be wider than element type. - if (EltVT != Op->getValueType(0)) - Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res); - return Res; -} - -static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *ST) { - if (!ST->hasMVEFloatOps()) - return SDValue(); - return LowerVecReduce(Op, DAG, ST); -} - static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { - if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering())) + if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or // equivalent available. return SDValue(); @@ -10344,13 +9231,12 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N, bool isBigEndian = DAG.getDataLayout().isBigEndian(); - SDValue Lo = + Results.push_back( DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, - SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); - SDValue Hi = + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); + Results.push_back( DAG.getTargetExtractSubreg(isBigEndian ? 
ARM::gsub_0 : ARM::gsub_1, - SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi)); + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); Results.push_back(SDValue(CmpSwap, 2)); } @@ -10399,15 +9285,6 @@ SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues({Result, Chain}, dl); } -SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const { - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - - EVT VT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); - int FI = MFI.CreateFixedObject(4, 0, false); - return DAG.getFrameIndex(FI, VT); -} - SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { @@ -10431,8 +9308,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_FP_TO_UINT: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); - case ISD::FP_TO_SINT_SAT: - case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); @@ -10463,11 +9338,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); - case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget); - case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget); - case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); - case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) @@ -10487,25 +9358,13 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerUnsignedALUO(Op, DAG); case ISD::SADDSAT: case ISD::SSUBSAT: - case ISD::UADDSAT: - case ISD::USUBSAT: - return LowerADDSUBSAT(Op, DAG, Subtarget); + return LowerSADDSUBSAT(Op, DAG, Subtarget); case ISD::LOAD: return LowerPredicateLoad(Op, DAG); case ISD::STORE: - return LowerSTORE(Op, DAG, Subtarget); + return LowerPredicateStore(Op, DAG); case ISD::MLOAD: return LowerMLOAD(Op, DAG); - case ISD::VECREDUCE_MUL: - case ISD::VECREDUCE_AND: - case ISD::VECREDUCE_OR: - case ISD::VECREDUCE_XOR: - return LowerVecReduce(Op, DAG, Subtarget); - case ISD::VECREDUCE_FADD: - case ISD::VECREDUCE_FMUL: - case ISD::VECREDUCE_FMIN: - case ISD::VECREDUCE_FMAX: - return LowerVecReduceF(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); @@ -10521,8 +9380,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG); - case ISD::SPONENTRY: - return LowerSPONENTRY(Op, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); } } @@ -10554,8 +9411,8 @@ static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, DAG.getVTList(MVT::i32, 
MVT::i32), N->getOperand(1), N->getOperand(2), Lo, Hi); - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, - LongMul.getValue(0), LongMul.getValue(1))); + Results.push_back(LongMul.getValue(0)); + Results.push_back(LongMul.getValue(1)); } /// ReplaceNodeResults - Replace the results of node with an illegal result @@ -10591,9 +9448,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::SADDSAT: case ISD::SSUBSAT: - case ISD::UADDSAT: - case ISD::USUBSAT: - Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget); + Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget); break; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); @@ -10608,20 +9463,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::INTRINSIC_WO_CHAIN: return ReplaceLongIntrinsic(N, Results, DAG); - case ISD::LOAD: - LowerLOAD(N, Results, DAG); - break; - case ISD::TRUNCATE: - Res = LowerTruncate(N, DAG, Subtarget); - break; - case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: - Res = LowerVectorExtend(N, DAG, Subtarget); - break; - case ISD::FP_TO_SINT_SAT: - case ISD::FP_TO_UINT_SAT: - Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget); - break; + case ISD::ABS: + lowerABS(N, Results, DAG); + return ; + } if (Res.getNode()) Results.push_back(Res); @@ -10654,7 +9499,7 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); - unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4)); + unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; @@ -10662,11 +9507,11 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), - MachineMemOperand::MOLoad, 4, Align(4)); + MachineMemOperand::MOLoad, 4, 4); MachineMemOperand *FIMMOSt = MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), - MachineMemOperand::MOStore, 4, Align(4)); + MachineMemOperand::MOStore, 4, 4); // Load the address of the dispatch MBB into the jump buffer. if (isThumb2) { @@ -10777,23 +9622,25 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // associated with. DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; unsigned MaxCSNum = 0; - for (MachineBasicBlock &BB : *MF) { - if (!BB.isEHPad()) - continue; + for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; + ++BB) { + if (!BB->isEHPad()) continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing // pad. 
- for (MachineInstr &II : BB) { - if (!II.isEHLabel()) - continue; + for (MachineBasicBlock::iterator + II = BB->begin(), IE = BB->end(); II != IE; ++II) { + if (!II->isEHLabel()) continue; - MCSymbol *Sym = II.getOperand(0).getMCSymbol(); + MCSymbol *Sym = II->getOperand(0).getMCSymbol(); if (!MF->hasCallSiteLandingPad(Sym)) continue; SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); - for (unsigned Idx : CallSiteIdxs) { - CallSiteNumToLPad[Idx].push_back(&BB); - MaxCSNum = std::max(MaxCSNum, Idx); + for (SmallVectorImpl<unsigned>::iterator + CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); + CSI != CSE; ++CSI) { + CallSiteNumToLPad[*CSI].push_back(&*BB); + MaxCSNum = std::max(MaxCSNum, *CSI); } break; } @@ -10805,9 +9652,10 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned I = 1; I <= MaxCSNum; ++I) { SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; - for (MachineBasicBlock *MBB : MBBList) { - LPadList.push_back(MBB); - InvokeBBs.insert(MBB->pred_begin(), MBB->pred_end()); + for (SmallVectorImpl<MachineBasicBlock*>::iterator + II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { + LPadList.push_back(*II); + InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); } } @@ -10849,7 +9697,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), - MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4)); + MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); MachineInstrBuilder MIB; MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); @@ -10940,8 +9788,10 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); + if (Align == 0) + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) @@ -10978,9 +9828,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(NewVReg3) .add(predOps(ARMCC::AL)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), - MachineMemOperand::MOLoad, 4, Align(4)); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) @@ -11040,8 +9889,10 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. 
- Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); + if (Align == 0) + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) @@ -11071,9 +9922,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), - MachineMemOperand::MOLoad, 4, Align(4)); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) .addReg(NewVReg3, RegState::Kill) @@ -11096,7 +9946,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Add the jump table entries as successors to the MBB. SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; - for (MachineBasicBlock *CurMBB : LPadList) { + for (std::vector<MachineBasicBlock*>::iterator + I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { + MachineBasicBlock *CurMBB = *I; if (SeenMBBs.insert(CurMBB).second) DispContBB->addSuccessor(CurMBB); } @@ -11108,7 +9960,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. - SmallVector<MachineBasicBlock*, 4> Successors(BB->successors()); + SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), + BB->succ_end()); while (!Successors.empty()) { MachineBasicBlock *SMBB = Successors.pop_back_val(); if (SMBB->isEHPad()) { @@ -11158,8 +10011,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. - for (MachineBasicBlock *MBBLPad : MBBLPads) - MBBLPad->setIsEHPad(false); + for (SmallVectorImpl<MachineBasicBlock*>::iterator + I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) + (*I)->setIsEHPad(false); // The instruction is gone now. MI.eraseFromParent(); @@ -11167,9 +10021,10 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, static MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { - for (MachineBasicBlock *S : MBB->successors()) - if (S != Succ) - return S; + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) + if (*I != Succ) + return *I; llvm_unreachable("Expecting a BB with two successors!"); } @@ -11307,7 +10162,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, Register dest = MI.getOperand(0).getReg(); Register src = MI.getOperand(1).getReg(); unsigned SizeVal = MI.getOperand(2).getImm(); - unsigned Alignment = MI.getOperand(3).getImm(); + unsigned Align = MI.getOperand(3).getImm(); DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); @@ -11320,17 +10175,17 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, bool IsThumb2 = Subtarget->isThumb2(); bool IsThumb = Subtarget->isThumb(); - if (Alignment & 1) { + if (Align & 1) { UnitSize = 1; - } else if (Alignment & 2) { + } else if (Align & 2) { UnitSize = 2; } else { // Check whether we can use NEON instructions. 
if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && Subtarget->hasNEON()) { - if ((Alignment % 16 == 0) && SizeVal >= 16) + if ((Align % 16 == 0) && SizeVal >= 16) UnitSize = 16; - else if ((Alignment % 8 == 0) && SizeVal >= 8) + else if ((Align % 8 == 0) && SizeVal >= 8) UnitSize = 8; } // Can't use NEON instructions. @@ -11436,11 +10291,13 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. - Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); + if (Align == 0) + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), - MachineMemOperand::MOLoad, 4, Align(4)); + MachineMemOperand::MOLoad, 4, 4); if (IsThumb) BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) @@ -11590,7 +10447,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) .addExternalSymbol("__chkstk"); - BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent()))) + BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) .add(predOps(ARMCC::AL)) .addReg(Reg, RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Kill) @@ -11667,9 +10524,13 @@ static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, // If we hit the end of the block, check whether CPSR is live into a // successor. if (miI == BB->end()) { - for (MachineBasicBlock *Succ : BB->successors()) - if (Succ->isLiveIn(ARM::CPSR)) + for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), + sEnd = BB->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock* succ = *sItr; + if (succ->isLiveIn(ARM::CPSR)) return false; + } } // We found a def, or hit the end of the basic block and CPSR wasn't live @@ -11678,148 +10539,6 @@ static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, return true; } -/// Adds logic in loop entry MBB to calculate loop iteration count and adds -/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop -static Register genTPEntry(MachineBasicBlock *TpEntry, - MachineBasicBlock *TpLoopBody, - MachineBasicBlock *TpExit, Register OpSizeReg, - const TargetInstrInfo *TII, DebugLoc Dl, - MachineRegisterInfo &MRI) { - // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4. - Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg) - .addUse(OpSizeReg) - .addImm(15) - .add(predOps(ARMCC::AL)) - .addReg(0); - - Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg) - .addUse(AddDestReg, RegState::Kill) - .addImm(4) - .add(predOps(ARMCC::AL)) - .addReg(0); - - Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); - BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg) - .addUse(LsrDestReg, RegState::Kill); - - BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart)) - .addUse(TotalIterationsReg) - .addMBB(TpExit); - - BuildMI(TpEntry, Dl, TII->get(ARM::t2B)) - .addMBB(TpLoopBody) - .add(predOps(ARMCC::AL)); - - return TotalIterationsReg; -} - -/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and -/// t2DoLoopEnd. 
These are used by later passes to generate tail predicated -/// loops. -static void genTPLoopBody(MachineBasicBlock *TpLoopBody, - MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, - const TargetInstrInfo *TII, DebugLoc Dl, - MachineRegisterInfo &MRI, Register OpSrcReg, - Register OpDestReg, Register ElementCountReg, - Register TotalIterationsReg, bool IsMemcpy) { - // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest - // array, loop iteration counter, predication counter. - - Register SrcPhiReg, CurrSrcReg; - if (IsMemcpy) { - // Current position in the src array - SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg) - .addUse(OpSrcReg) - .addMBB(TpEntry) - .addUse(CurrSrcReg) - .addMBB(TpLoopBody); - } - - // Current position in the dest array - Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg) - .addUse(OpDestReg) - .addMBB(TpEntry) - .addUse(CurrDestReg) - .addMBB(TpLoopBody); - - // Current loop counter - Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); - Register RemainingLoopIterationsReg = - MRI.createVirtualRegister(&ARM::GPRlrRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg) - .addUse(TotalIterationsReg) - .addMBB(TpEntry) - .addUse(RemainingLoopIterationsReg) - .addMBB(TpLoopBody); - - // Predication counter - Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg) - .addUse(ElementCountReg) - .addMBB(TpEntry) - .addUse(RemainingElementsReg) - .addMBB(TpLoopBody); - - // Pass predication counter to VCTP - Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg) - .addUse(PredCounterPhiReg) - .addImm(ARMVCC::None) - .addReg(0) - .addReg(0); - - BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg) - .addUse(PredCounterPhiReg) - .addImm(16) - .add(predOps(ARMCC::AL)) - .addReg(0); - - // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR - Register SrcValueReg; - if (IsMemcpy) { - SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass); - BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post)) - .addDef(CurrSrcReg) - .addDef(SrcValueReg) - .addReg(SrcPhiReg) - .addImm(16) - .addImm(ARMVCC::Then) - .addUse(VccrReg) - .addReg(0); - } else - SrcValueReg = OpSrcReg; - - BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post)) - .addDef(CurrDestReg) - .addUse(SrcValueReg) - .addReg(DestPhiReg) - .addImm(16) - .addImm(ARMVCC::Then) - .addUse(VccrReg) - .addReg(0); - - // Add the pseudoInstrs for decrementing the loop counter and marking the - // end:t2DoLoopDec and t2DoLoopEnd - BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg) - .addUse(LoopCounterPhiReg) - .addImm(1); - - BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd)) - .addUse(RemainingLoopIterationsReg) - .addMBB(TpLoopBody); - - BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B)) - .addMBB(TpExit) - .add(predOps(ARMCC::AL)); -} - MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -11846,98 +10565,6 @@ 
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; } - case ARM::MVE_MEMCPYLOOPINST: - case ARM::MVE_MEMSETLOOPINST: { - - // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo - // into a Tail Predicated (TP) Loop. It adds the instructions to calculate - // the iteration count =ceil(size_in_bytes/16)) in the TP entry block and - // adds the relevant instructions in the TP loop Body for generation of a - // WLSTP loop. - - // Below is relevant portion of the CFG after the transformation. - // The Machine Basic Blocks are shown along with branch conditions (in - // brackets). Note that TP entry/exit MBBs depict the entry/exit of this - // portion of the CFG and may not necessarily be the entry/exit of the - // function. - - // (Relevant) CFG after transformation: - // TP entry MBB - // | - // |-----------------| - // (n <= 0) (n > 0) - // | | - // | TP loop Body MBB<--| - // | | | - // \ |___________| - // \ / - // TP exit MBB - - MachineFunction *MF = BB->getParent(); - MachineFunctionProperties &Properties = MF->getProperties(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - Register OpDestReg = MI.getOperand(0).getReg(); - Register OpSrcReg = MI.getOperand(1).getReg(); - Register OpSizeReg = MI.getOperand(2).getReg(); - - // Allocate the required MBBs and add to parent function. - MachineBasicBlock *TpEntry = BB; - MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock(); - MachineBasicBlock *TpExit; - - MF->push_back(TpLoopBody); - - // If any instructions are present in the current block after - // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and - // move the instructions into the newly created exit block. If there are no - // instructions add an explicit branch to the FallThrough block and then - // split. - // - // The split is required for two reasons: - // 1) A terminator(t2WhileLoopStart) will be placed at that site. - // 2) Since a TPLoopBody will be added later, any phis in successive blocks - // need to be updated. splitAt() already handles this. - TpExit = BB->splitAt(MI, false); - if (TpExit == BB) { - assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the " - "block containing memcpy/memset Pseudo"); - TpExit = BB->getFallThrough(); - BuildMI(BB, dl, TII->get(ARM::t2B)) - .addMBB(TpExit) - .add(predOps(ARMCC::AL)); - TpExit = BB->splitAt(MI, false); - } - - // Add logic for iteration count - Register TotalIterationsReg = - genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI); - - // Add the vectorized (and predicated) loads/store instructions - bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST; - genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg, - OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy); - - // Required to avoid conflict with the MachineVerifier during testing. - Properties.reset(MachineFunctionProperties::Property::NoPHIs); - - // Connect the blocks - TpEntry->addSuccessor(TpLoopBody); - TpLoopBody->addSuccessor(TpLoopBody); - TpLoopBody->addSuccessor(TpExit); - - // Reorder for a more natural layout - TpLoopBody->moveAfter(TpEntry); - TpExit->moveAfter(TpLoopBody); - - // Finally, remove the memcpy Psuedo Instruction - MI.eraseFromParent(); - - // Return the exit block as it may contain other instructions requiring a - // custom inserter - return TpExit; - } - // The Thumb2 pre-indexed stores have the same MI operands, they just // define them differently in the .td files from the isel patterns, so // they need pseudos. 
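// The expansion above turns the memcpy/memset pseudo into a WLSTP-style tail
// predicated loop; the arithmetic it sets up is ceil(n/16) iterations with the
// tail lanes predicated away. A minimal scalar sketch of that shape, in plain
// C++ rather than the MachineIR built by genTPEntry/genTPLoopBody (all names
// here are illustrative only):
#include <algorithm>
#include <cstddef>
#include <cstdint>

static void tailPredicatedCopy(uint8_t *Dst, const uint8_t *Src, size_t N) {
  size_t Iterations = (N + 15) >> 4;                 // ceil(N / 16)
  size_t Remaining = N;                              // predication counter
  for (size_t I = 0; I < Iterations; ++I) {
    size_t Lanes = std::min<size_t>(Remaining, 16);  // active lanes (the VCTP)
    for (size_t L = 0; L < Lanes; ++L)               // predicated VLDRB/VSTRB
      Dst[I * 16 + L] = Src[I * 16 + L];
    Remaining -= Lanes;                              // counter decremented per pass
  }
}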
@@ -11985,8 +10612,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); - for (const MachineOperand &MO : MI.operands()) - MIB.add(MO); + for (unsigned i = 0; i < MI.getNumOperands(); ++i) + MIB.add(MI.getOperand(i)); MI.eraseFromParent(); return BB; } @@ -12266,7 +10893,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if (Subtarget->isThumb1Only()) { for (unsigned c = MCID->getNumOperands() - 4; c--;) { MI.addOperand(MI.getOperand(1)); - MI.removeOperand(1); + MI.RemoveOperand(1); } // Restore the ties @@ -12289,7 +10916,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // Any ARM instruction that sets the 's' bit should specify an optional // "cc_out" operand in the last operand position. - if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) { + if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { assert(!NewOpc && "Optional cc_out operand required"); return; } @@ -12304,7 +10931,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, definesCPSR = true; if (MO.isDead()) deadCPSR = true; - MI.removeOperand(i); + MI.RemoveOperand(i); break; } } @@ -12375,7 +11002,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, // (zext cc) can never be the all ones value. if (AllOnes) return false; - [[fallthrough]]; + LLVM_FALLTHROUGH; case ISD::SIGN_EXTEND: { SDLoc dl(N); EVT VT = N->getValueType(0); @@ -12391,7 +11018,8 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, // When looking for a 0 constant, N can be zext or sext. OtherOp = DAG.getConstant(1, dl, VT); else - OtherOp = DAG.getAllOnesConstant(dl, VT); + OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, + VT); return true; } } @@ -12983,7 +11611,7 @@ static SDValue PerformAddcSubcCombine(SDNode *N, const ARMSubtarget *Subtarget) { SelectionDAG &DAG(DCI.DAG); - if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) { + if (N->getOpcode() == ARMISD::SUBC) { // (SUBC (ADDE 0, 0, C), 1) -> C SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -13039,333 +11667,20 @@ static SDValue PerformAddeSubeCombine(SDNode *N, return SDValue(); } -static SDValue PerformSELECTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - SDLoc dl(N); - SDValue SetCC; - SDValue LHS; - SDValue RHS; - ISD::CondCode CC; - SDValue TrueVal; - SDValue FalseVal; - - if (N->getOpcode() == ISD::SELECT && - N->getOperand(0)->getOpcode() == ISD::SETCC) { - SetCC = N->getOperand(0); - LHS = SetCC->getOperand(0); - RHS = SetCC->getOperand(1); - CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); - TrueVal = N->getOperand(1); - FalseVal = N->getOperand(2); - } else if (N->getOpcode() == ISD::SELECT_CC) { - LHS = N->getOperand(0); - RHS = N->getOperand(1); - CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - TrueVal = N->getOperand(2); - FalseVal = N->getOperand(3); - } else { - return SDValue(); - } - - unsigned int Opcode = 0; - if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN || - FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) && - (CC == ISD::SETULT || CC == ISD::SETUGT)) { - Opcode = ARMISD::VMINVu; - if (CC == ISD::SETUGT) - std::swap(TrueVal, FalseVal); - } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN || - FalseVal->getOpcode() == 
ISD::VECREDUCE_SMIN) && - (CC == ISD::SETLT || CC == ISD::SETGT)) { - Opcode = ARMISD::VMINVs; - if (CC == ISD::SETGT) - std::swap(TrueVal, FalseVal); - } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX || - FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) && - (CC == ISD::SETUGT || CC == ISD::SETULT)) { - Opcode = ARMISD::VMAXVu; - if (CC == ISD::SETULT) - std::swap(TrueVal, FalseVal); - } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX || - FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) && - (CC == ISD::SETGT || CC == ISD::SETLT)) { - Opcode = ARMISD::VMAXVs; - if (CC == ISD::SETLT) - std::swap(TrueVal, FalseVal); - } else - return SDValue(); - - // Normalise to the right hand side being the vector reduction - switch (TrueVal->getOpcode()) { - case ISD::VECREDUCE_UMIN: - case ISD::VECREDUCE_SMIN: - case ISD::VECREDUCE_UMAX: - case ISD::VECREDUCE_SMAX: - std::swap(LHS, RHS); - std::swap(TrueVal, FalseVal); - break; - } - - EVT VectorType = FalseVal->getOperand(0).getValueType(); - - if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 && - VectorType != MVT::v4i32) - return SDValue(); - - EVT VectorScalarType = VectorType.getVectorElementType(); - - // The values being selected must also be the ones being compared - if (TrueVal != LHS || FalseVal != RHS) - return SDValue(); - - EVT LeftType = LHS->getValueType(0); - EVT RightType = RHS->getValueType(0); - - // The types must match the reduced type too - if (LeftType != VectorScalarType || RightType != VectorScalarType) - return SDValue(); - - // Legalise the scalar to an i32 - if (VectorScalarType != MVT::i32) - LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); - - // Generate the reduction as an i32 for legalisation purposes - auto Reduction = - DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0)); - - // The result isn't actually an i32 so truncate it back to its original type - if (VectorScalarType != MVT::i32) - Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction); - - return Reduction; -} - -// A special combine for the vqdmulh family of instructions. This is one of the -// potential set of patterns that could patch this instruction. The base pattern -// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))). -// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))), -// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as -// the max is unnecessary. -static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - SDValue Shft; - ConstantSDNode *Clamp; - - if (!VT.isVector() || VT.getScalarSizeInBits() > 64) - return SDValue(); - - if (N->getOpcode() == ISD::SMIN) { - Shft = N->getOperand(0); - Clamp = isConstOrConstSplat(N->getOperand(1)); - } else if (N->getOpcode() == ISD::VSELECT) { - // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin. 
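// The clamp/shift pattern described above is, per 16-bit lane, the saturating
// doubling multiply-high value: min((x*y) >> 15, 32767), which agrees with
// (2*x*y) >> 16 everywhere except x == y == INT16_MIN, where the clamp keeps
// the result at 0x7FFF. A scalar sketch of the arithmetic only, not of the DAG
// matching (vqdmulh_i16 is an illustrative name; arithmetic >> assumed for
// negative products):
#include <algorithm>
#include <cstdint>

static int16_t vqdmulh_i16(int16_t x, int16_t y) {
  int32_t Prod = int32_t(x) * int32_t(y);              // sext + mul
  int32_t Shifted = Prod >> 15;                        // ashr by 15
  return int16_t(std::min<int32_t>(Shifted, 32767));   // smin with (1 << 15) - 1
}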
- SDValue Cmp = N->getOperand(0); - if (Cmp.getOpcode() != ISD::SETCC || - cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT || - Cmp.getOperand(0) != N->getOperand(1) || - Cmp.getOperand(1) != N->getOperand(2)) - return SDValue(); - Shft = N->getOperand(1); - Clamp = isConstOrConstSplat(N->getOperand(2)); - } else - return SDValue(); - - if (!Clamp) - return SDValue(); - - MVT ScalarType; - int ShftAmt = 0; - switch (Clamp->getSExtValue()) { - case (1 << 7) - 1: - ScalarType = MVT::i8; - ShftAmt = 7; - break; - case (1 << 15) - 1: - ScalarType = MVT::i16; - ShftAmt = 15; - break; - case (1ULL << 31) - 1: - ScalarType = MVT::i32; - ShftAmt = 31; - break; - default: - return SDValue(); - } - - if (Shft.getOpcode() != ISD::SRA) - return SDValue(); - ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1)); - if (!N1 || N1->getSExtValue() != ShftAmt) - return SDValue(); - - SDValue Mul = Shft.getOperand(0); - if (Mul.getOpcode() != ISD::MUL) - return SDValue(); - - SDValue Ext0 = Mul.getOperand(0); - SDValue Ext1 = Mul.getOperand(1); - if (Ext0.getOpcode() != ISD::SIGN_EXTEND || - Ext1.getOpcode() != ISD::SIGN_EXTEND) - return SDValue(); - EVT VecVT = Ext0.getOperand(0).getValueType(); - if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1) - return SDValue(); - if (Ext1.getOperand(0).getValueType() != VecVT || - VecVT.getScalarType() != ScalarType || - VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2) - return SDValue(); - - SDLoc DL(Mul); - unsigned LegalLanes = 128 / (ShftAmt + 1); - EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes); - // For types smaller than legal vectors extend to be legal and only use needed - // lanes. - if (VecVT.getSizeInBits() < 128) { - EVT ExtVecVT = - MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()), - VecVT.getVectorNumElements()); - SDValue Inp0 = - DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0)); - SDValue Inp1 = - DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0)); - Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0); - Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1); - SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1); - SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH); - Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc); - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc); - } - - // For larger types, split into legal sized chunks. - assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type"); - unsigned NumParts = VecVT.getSizeInBits() / 128; - SmallVector<SDValue> Parts; - for (unsigned I = 0; I < NumParts; ++I) { - SDValue Inp0 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0), - DAG.getVectorIdxConstant(I * LegalLanes, DL)); - SDValue Inp1 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0), - DAG.getVectorIdxConstant(I * LegalLanes, DL)); - SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1); - Parts.push_back(VQDMULH); - } - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, - DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts)); -} - -static SDValue PerformVSELECTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG)) - return V; - - // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs). 
- // - // We need to re-implement this optimization here as the implementation in the - // Target-Independent DAGCombiner does not handle the kind of constant we make - // (it calls isConstOrConstSplat with AllowTruncation set to false - and for - // good reason, allowing truncation there would break other targets). - // - // Currently, this is only done for MVE, as it's the only target that benefits - // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL). - if (N->getOperand(0).getOpcode() != ISD::XOR) - return SDValue(); - SDValue XOR = N->getOperand(0); - - // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s. - // It is important to check with truncation allowed as the BUILD_VECTORs we - // generate in those situations will truncate their operands. - ConstantSDNode *Const = - isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false, - /*AllowTruncation*/ true); - if (!Const || !Const->isOne()) - return SDValue(); - - // Rewrite into vselect(cond, rhs, lhs). - SDValue Cond = XOR->getOperand(0); - SDValue LHS = N->getOperand(1); - SDValue RHS = N->getOperand(2); - EVT Type = N->getValueType(0); - return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS); -} - -// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n -static SDValue PerformVSetCCToVCTPCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); - EVT VT = N->getValueType(0); - - if (!Subtarget->hasMVEIntegerOps() || - !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); - - if (CC == ISD::SETUGE) { - std::swap(Op0, Op1); - CC = ISD::SETULT; - } - - if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 || - Op0.getOpcode() != ISD::BUILD_VECTOR) - return SDValue(); - - // Check first operand is BuildVector of 0,1,2,... 
- for (unsigned I = 0; I < VT.getVectorNumElements(); I++) { - if (!Op0.getOperand(I).isUndef() && - !(isa<ConstantSDNode>(Op0.getOperand(I)) && - Op0.getConstantOperandVal(I) == I)) - return SDValue(); - } - - // The second is a Splat of Op1S - SDValue Op1S = DCI.DAG.getSplatValue(Op1); - if (!Op1S) - return SDValue(); - - unsigned Opc; - switch (VT.getVectorNumElements()) { - case 2: - Opc = Intrinsic::arm_mve_vctp64; - break; - case 4: - Opc = Intrinsic::arm_mve_vctp32; - break; - case 8: - Opc = Intrinsic::arm_mve_vctp16; - break; - case 16: - Opc = Intrinsic::arm_mve_vctp8; - break; - default: - return SDValue(); - } - - SDLoc DL(N); - return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DCI.DAG.getConstant(Opc, DL, MVT::i32), - DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32)); -} - static SDValue PerformABSCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SDValue res; SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) return SDValue(); - return TLI.expandABS(N, DAG); + if (!TLI.expandABS(N, res, DAG)) + return SDValue(); + + return res; } /// PerformADDECombine - Target-specific dag combine transform from @@ -13409,248 +11724,9 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } -static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc dl(N); - - auto IsVecReduce = [](SDValue Op) { - switch (Op.getOpcode()) { - case ISD::VECREDUCE_ADD: - case ARMISD::VADDVs: - case ARMISD::VADDVu: - case ARMISD::VMLAVs: - case ARMISD::VMLAVu: - return true; - } - return false; - }; - - auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) { - // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) -> - // add(add(X, vecreduce(Y)), vecreduce(Z)) - // to make better use of vaddva style instructions. - if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) && - IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) && - !isa<ConstantSDNode>(N0) && N1->hasOneUse()) { - SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0)); - return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1)); - } - // And turn add(add(A, reduce(B)), add(C, reduce(D))) -> - // add(add(add(A, C), reduce(B)), reduce(D)) - if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD && - N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) { - unsigned N0RedOp = 0; - if (!IsVecReduce(N0.getOperand(N0RedOp))) { - N0RedOp = 1; - if (!IsVecReduce(N0.getOperand(N0RedOp))) - return SDValue(); - } - - unsigned N1RedOp = 0; - if (!IsVecReduce(N1.getOperand(N1RedOp))) - N1RedOp = 1; - if (!IsVecReduce(N1.getOperand(N1RedOp))) - return SDValue(); - - SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp), - N1.getOperand(1 - N1RedOp)); - SDValue Add1 = - DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp)); - return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp)); - } - return SDValue(); - }; - if (SDValue R = DistrubuteAddAddVecReduce(N0, N1)) - return R; - if (SDValue R = DistrubuteAddAddVecReduce(N1, N0)) - return R; - - // Distribute add(vecreduce(load(Y)), vecreduce(load(Z))) - // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z))) - // by ascending load offsets. 
This can help cores prefetch if the order of - // loads is more predictable. - auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) { - // Check if two reductions are known to load data where one is before/after - // another. Return negative if N0 loads data before N1, positive if N1 is - // before N0 and 0 otherwise if nothing is known. - auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) { - // Look through to the first operand of a MUL, for the VMLA case. - // Currently only looks at the first operand, in the hope they are equal. - if (N0.getOpcode() == ISD::MUL) - N0 = N0.getOperand(0); - if (N1.getOpcode() == ISD::MUL) - N1 = N1.getOperand(0); - - // Return true if the two operands are loads to the same object and the - // offset of the first is known to be less than the offset of the second. - LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0); - LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1); - if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() || - !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() || - Load1->isIndexed()) - return 0; - - auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG); - auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG); - - if (!BaseLocDecomp0.getBase() || - BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() || - !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset()) - return 0; - if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset()) - return -1; - if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset()) - return 1; - return 0; - }; - - SDValue X; - if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) { - if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) { - int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0), - N0.getOperand(1).getOperand(0)); - if (IsBefore < 0) { - X = N0.getOperand(0); - N0 = N0.getOperand(1); - } else if (IsBefore > 0) { - X = N0.getOperand(1); - N0 = N0.getOperand(0); - } else - return SDValue(); - } else if (IsVecReduce(N0.getOperand(0))) { - X = N0.getOperand(1); - N0 = N0.getOperand(0); - } else if (IsVecReduce(N0.getOperand(1))) { - X = N0.getOperand(0); - N0 = N0.getOperand(1); - } else - return SDValue(); - } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) && - IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) { - // Note this is backward to how you would expect. 
We create - // add(reduce(load + 16), reduce(load + 0)) so that the - // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving - // the X as VADDV(load + 0) - return DAG.getNode(ISD::ADD, dl, VT, N1, N0); - } else - return SDValue(); - - if (!IsVecReduce(N0) || !IsVecReduce(N1)) - return SDValue(); - - if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0) - return SDValue(); - - // Switch from add(add(X, N0), N1) to add(add(X, N1), N0) - SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1); - return DAG.getNode(ISD::ADD, dl, VT, Add0, N0); - }; - if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true)) - return R; - if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false)) - return R; - return SDValue(); -} - -static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - if (SDValue R = TryDistrubutionADDVecReduce(N, DAG)) - return R; - - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc dl(N); - - if (VT != MVT::i64) - return SDValue(); - - // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this - // will look like: - // t1: i32,i32 = ARMISD::VADDLVs x - // t2: i64 = build_pair t1, t1:1 - // t3: i64 = add t2, y - // Otherwise we try to push the add up above VADDLVAx, to potentially allow - // the add to be simplified seperately. - // We also need to check for sext / zext and commutitive adds. - auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA, - SDValue NB) { - if (NB->getOpcode() != ISD::BUILD_PAIR) - return SDValue(); - SDValue VecRed = NB->getOperand(0); - if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) || - VecRed.getResNo() != 0 || - NB->getOperand(1) != SDValue(VecRed.getNode(), 1)) - return SDValue(); - - if (VecRed->getOpcode() == OpcodeA) { - // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y) - SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, - VecRed.getOperand(0), VecRed.getOperand(1)); - NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA); - } - - SmallVector<SDValue, 4> Ops; - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, - DAG.getConstant(0, dl, MVT::i32))); - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, - DAG.getConstant(1, dl, MVT::i32))); - unsigned S = VecRed->getOpcode() == OpcodeA ? 
2 : 0; - for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++) - Ops.push_back(VecRed->getOperand(I)); - SDValue Red = - DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops); - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red, - SDValue(Red.getNode(), 1)); - }; - - if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0)) - return M; - return SDValue(); -} - bool ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const { - assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA || - N->getOpcode() == ISD::SRL) && - "Expected shift op"); - if (Level == BeforeLegalizeTypes) return true; @@ -13684,38 +11760,8 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, return false; } -bool ARMTargetLowering::isDesirableToCommuteXorWithShift( - const SDNode *N) const { - assert(N->getOpcode() == ISD::XOR && - (N->getOperand(0).getOpcode() == ISD::SHL || - N->getOperand(0).getOpcode() == ISD::SRL) && - "Expected XOR(SHIFT) pattern"); - - // Only commute if the entire NOT mask is a hidden shifted mask. 
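// "Hidden shifted mask" above means the XOR constant covers exactly the bits
// the shift can produce, in which case the NOT can be hoisted through the
// shift: (X << C) ^ (~0u << C) == (~X) << C, and likewise
// (X >> C) ^ (~0u >> C) == (~X) >> C for a logical right shift. A sketch of
// the identity only (not of the DAG code), valid for C in [0, 31] on 32-bit
// values; the helper name is illustrative:
#include <cstdint>

static bool notCommutesWithShl(uint32_t X, unsigned C) {
  uint32_t Mask = ~0u << C;                  // shifted mask starting at bit C
  return ((X << C) ^ Mask) == ((~X) << C);   // holds for every X when C < 32
}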
- auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1)); - auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)); - if (XorC && ShiftC) { - unsigned MaskIdx, MaskLen; - if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) { - unsigned ShiftAmt = ShiftC->getZExtValue(); - unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); - if (N->getOperand(0).getOpcode() == ISD::SHL) - return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt); - return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt); - } - } - - return false; -} - bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { - assert(((N->getOpcode() == ISD::SHL && - N->getOperand(0).getOpcode() == ISD::SRL) || - (N->getOpcode() == ISD::SRL && - N->getOperand(0).getOpcode() == ISD::SHL)) && - "Expected shift-shift mask"); - if (!Subtarget->isThumb1Only()) return true; @@ -13734,26 +11780,6 @@ bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { return VT.isScalarInteger(); } -bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, - EVT VT) const { - if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple()) - return false; - - switch (FPVT.getSimpleVT().SimpleTy) { - case MVT::f16: - return Subtarget->hasVFP2Base(); - case MVT::f32: - return Subtarget->hasVFP2Base(); - case MVT::f64: - return Subtarget->hasFP64(); - case MVT::v4f32: - case MVT::v8f16: - return Subtarget->hasMVEFloatOps(); - default: - return false; - } -} - static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { @@ -13781,7 +11807,7 @@ static SDValue PerformSHLSimplify(SDNode *N, return SDValue(); // Check that all the users could perform the shl themselves. - for (auto *U : N->uses()) { + for (auto U : N->uses()) { switch(U->getOpcode()) { default: return SDValue(); @@ -13823,13 +11849,10 @@ static SDValue PerformSHLSimplify(SDNode *N, APInt C2Int = C2->getAPIntValue(); APInt C1Int = C1ShlC2->getAPIntValue(); - unsigned C2Width = C2Int.getBitWidth(); - if (C2Int.uge(C2Width)) - return SDValue(); - uint64_t C2Value = C2Int.getZExtValue(); // Check that performing a lshr will not lose any information. - APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value); + APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), + C2Int.getBitWidth() - C2->getZExtValue()); if ((C1Int & Mask) != C1Int) return SDValue(); @@ -13872,9 +11895,6 @@ static SDValue PerformADDCombine(SDNode *N, if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) return Result; - if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget)) - return Result; - // First try with the default operand order. if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) return Result; @@ -13883,26 +11903,6 @@ static SDValue PerformADDCombine(SDNode *N, return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); } -// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC) -// providing -X is as cheap as X (currently, just a constant). 
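// The fold documented above is a two's-complement identity: csinc picks X or
// Y + 1, and -(Y + 1) == ~Y, so negating the result picks -X or ~Y, i.e. a
// csinv of -X and Y. A self-checking sketch with stand-in helpers (unsigned
// arithmetic so the wraparound is well defined); not the ARMISD node code:
#include <cstdint>

static uint32_t csinc(bool CC, uint32_t X, uint32_t Y) { return CC ? X : Y + 1; }
static uint32_t csinv(bool CC, uint32_t A, uint32_t B) { return CC ? A : ~B; }

static bool foldHolds(bool CC, uint32_t X, uint32_t Y) {
  return 0u - csinc(CC, X, Y) == csinv(CC, 0u - X, Y);  // true for all inputs
}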
-static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) { - if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0))) - return SDValue(); - SDValue CSINC = N->getOperand(1); - if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse()) - return SDValue(); - - ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0)); - if (!X) - return SDValue(); - - return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32, - DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0), - CSINC.getOperand(0)), - CSINC.getOperand(1), CSINC.getOperand(2), - CSINC.getOperand(3)); -} - /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. /// static SDValue PerformSUBCombine(SDNode *N, @@ -13916,9 +11916,6 @@ static SDValue PerformSUBCombine(SDNode *N, if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) return Result; - if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG)) - return R; - if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) return SDValue(); @@ -13989,86 +11986,18 @@ static SDValue PerformVMULCombine(SDNode *N, DAG.getNode(ISD::MUL, DL, VT, N01, N1)); } -static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - EVT VT = N->getValueType(0); - if (VT != MVT::v2i64) - return SDValue(); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - auto IsSignExt = [&](SDValue Op) { - if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG) - return SDValue(); - EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT(); - if (VT.getScalarSizeInBits() == 32) - return Op->getOperand(0); - return SDValue(); - }; - auto IsZeroExt = [&](SDValue Op) { - // Zero extends are a little more awkward. At the point we are matching - // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask. - // That might be before of after a bitcast depending on how the and is - // placed. Because this has to look through bitcasts, it is currently only - // supported on LE. 
- if (!Subtarget->isLittle()) - return SDValue(); - - SDValue And = Op; - if (And->getOpcode() == ISD::BITCAST) - And = And->getOperand(0); - if (And->getOpcode() != ISD::AND) - return SDValue(); - SDValue Mask = And->getOperand(1); - if (Mask->getOpcode() == ISD::BITCAST) - Mask = Mask->getOperand(0); - - if (Mask->getOpcode() != ISD::BUILD_VECTOR || - Mask.getValueType() != MVT::v4i32) - return SDValue(); - if (isAllOnesConstant(Mask->getOperand(0)) && - isNullConstant(Mask->getOperand(1)) && - isAllOnesConstant(Mask->getOperand(2)) && - isNullConstant(Mask->getOperand(3))) - return And->getOperand(0); - return SDValue(); - }; - - SDLoc dl(N); - if (SDValue Op0 = IsSignExt(N0)) { - if (SDValue Op1 = IsSignExt(N1)) { - SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); - SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); - return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a); - } - } - if (SDValue Op0 = IsZeroExt(N0)) { - if (SDValue Op1 = IsZeroExt(N1)) { - SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); - SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); - return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a); - } - } - - return SDValue(); -} - static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64) - return PerformMVEVMULLCombine(N, DAG, Subtarget); - if (Subtarget->isThumb1Only()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); + EVT VT = N->getValueType(0); if (VT.is64BitVector() || VT.is128BitVector()) return PerformVMULCombine(N, DCI, Subtarget); if (VT != MVT::i32) @@ -14253,21 +12182,20 @@ static SDValue PerformANDCombine(SDNode *N, EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 || - VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1) + if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && + if (BVN && Subtarget->hasNEON() && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || - SplatBitSize == 64) { + if (SplatBitSize <= 64) { EVT VbicVT; SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VbicVT, VT, OtherModImm); + DAG, dl, VbicVT, VT.is128BitVector(), + OtherModImm); if (Val.getNode()) { SDValue Input = DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); @@ -14497,43 +12425,58 @@ static bool isValidMVECond(unsigned CC, bool IsFloat) { }; } -static ARMCC::CondCodes getVCMPCondCode(SDValue N) { - if (N->getOpcode() == ARMISD::VCMP) - return (ARMCC::CondCodes)N->getConstantOperandVal(2); - else if (N->getOpcode() == ARMISD::VCMPZ) - return (ARMCC::CondCodes)N->getConstantOperandVal(1); - else - llvm_unreachable("Not a VCMP/VCMPZ!"); -} - -static bool CanInvertMVEVCMP(SDValue N) { - ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N)); - return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint()); -} - -static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, +static SDValue PerformORCombine_i1(SDNode *N, + 
TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain // together with predicates EVT VT = N->getValueType(0); - SDLoc DL(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - auto IsFreelyInvertable = [&](SDValue V) { - if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ) - return CanInvertMVEVCMP(V); - return false; - }; - - // At least one operand must be freely invertable. - if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1))) - return SDValue(); - - SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT); - SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT); - SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1); - return DAG.getLogicalNOT(DL, And, VT); + ARMCC::CondCodes CondCode0 = ARMCC::AL; + ARMCC::CondCodes CondCode1 = ARMCC::AL; + if (N0->getOpcode() == ARMISD::VCMP) + CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2)) + ->getZExtValue(); + else if (N0->getOpcode() == ARMISD::VCMPZ) + CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1)) + ->getZExtValue(); + if (N1->getOpcode() == ARMISD::VCMP) + CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2)) + ->getZExtValue(); + else if (N1->getOpcode() == ARMISD::VCMPZ) + CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1)) + ->getZExtValue(); + + if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) + return SDValue(); + + unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); + unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); + + if (!isValidMVECond(Opposite0, + N0->getOperand(0)->getValueType(0).isFloatingPoint()) || + !isValidMVECond(Opposite1, + N1->getOperand(0)->getValueType(0).isFloatingPoint())) + return SDValue(); + + SmallVector<SDValue, 4> Ops0; + Ops0.push_back(N0->getOperand(0)); + if (N0->getOpcode() == ARMISD::VCMP) + Ops0.push_back(N0->getOperand(1)); + Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); + SmallVector<SDValue, 4> Ops1; + Ops1.push_back(N1->getOperand(0)); + if (N1->getOpcode() == ARMISD::VCMP) + Ops1.push_back(N1->getOperand(1)); + Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); + + SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0); + SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1); + SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); + return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And, + DCI.DAG.getAllOnesConstant(SDLoc(N), VT)); } /// PerformORCombine - Target-specific dag combine xforms for ISD::OR @@ -14549,21 +12492,17 @@ static SDValue PerformORCombine(SDNode *N, if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 || - VT == MVT::v8i1 || VT == MVT::v16i1)) - return PerformORCombine_i1(N, DAG, Subtarget); - APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && + if (BVN && Subtarget->hasNEON() && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || - SplatBitSize == 64) { + if (SplatBitSize <= 64) { EVT VorrVT; - SDValue Val = - isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), - SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm); + SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), 
+ SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VorrVT, VT.is128BitVector(), + OtherModImm); if (Val.getNode()) { SDValue Input = DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); @@ -14614,7 +12553,7 @@ static SDValue PerformORCombine(SDNode *N, // Canonicalize the vector type to make instruction selection // simpler. EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; - SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT, + SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, N0->getOperand(1), N0->getOperand(0), N1->getOperand(0)); @@ -14624,6 +12563,10 @@ static SDValue PerformORCombine(SDNode *N, } } + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) + return PerformORCombine_i1(N, DCI, Subtarget); + // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when // reasonable. if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { @@ -14655,27 +12598,6 @@ static SDValue PerformXORCombine(SDNode *N, return Result; } - if (Subtarget->hasMVEIntegerOps()) { - // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition. - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - const TargetLowering *TLI = Subtarget->getTargetLowering(); - if (TLI->isConstTrueVal(N1) && - (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) { - if (CanInvertMVEVCMP(N0)) { - SDLoc DL(N0); - ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0)); - - SmallVector<SDValue, 4> Ops; - Ops.push_back(N0->getOperand(0)); - if (N0->getOpcode() == ARMISD::VCMP) - Ops.push_back(N0->getOperand(1)); - Ops.push_back(DAG.getConstant(CC, DL, MVT::i32)); - return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops); - } - } - } - return SDValue(); } @@ -14712,40 +12634,52 @@ static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { } static SDValue FindBFIToCombineWith(SDNode *N) { - // We have a BFI in N. Find a BFI it can combine with, if one exists. + // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, + // if one exists. APInt ToMask, FromMask; SDValue From = ParseBFI(N, ToMask, FromMask); SDValue To = N->getOperand(0); + // Now check for a compatible BFI to merge with. We can pass through BFIs that + // aren't compatible, but not if they set the same bit in their destination as + // we do (or that of any BFI we're going to combine with). SDValue V = To; - if (V.getOpcode() != ARMISD::BFI) - return SDValue(); + APInt CombinedToMask = ToMask; + while (V.getOpcode() == ARMISD::BFI) { + APInt NewToMask, NewFromMask; + SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); + if (NewFrom != From) { + // This BFI has a different base. Keep going. + CombinedToMask |= NewToMask; + V = V.getOperand(0); + continue; + } - APInt NewToMask, NewFromMask; - SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); - if (NewFrom != From) - return SDValue(); + // Do the written bits conflict with any we've seen so far? + if ((NewToMask & CombinedToMask).getBoolValue()) + // Conflicting bits - bail out because going further is unsafe. + return SDValue(); - // Do the written bits conflict with any we've seen so far? - if ((NewToMask & ToMask).getBoolValue()) - // Conflicting bits. - return SDValue(); + // Are the new bits contiguous when combined with the old bits? 
+ if (BitsProperlyConcatenate(ToMask, NewToMask) && + BitsProperlyConcatenate(FromMask, NewFromMask)) + return V; + if (BitsProperlyConcatenate(NewToMask, ToMask) && + BitsProperlyConcatenate(NewFromMask, FromMask)) + return V; - // Are the new bits contiguous when combined with the old bits? - if (BitsProperlyConcatenate(ToMask, NewToMask) && - BitsProperlyConcatenate(FromMask, NewFromMask)) - return V; - if (BitsProperlyConcatenate(NewToMask, ToMask) && - BitsProperlyConcatenate(NewFromMask, FromMask)) - return V; + // We've seen a write to some bits, so track it. + CombinedToMask |= NewToMask; + // Keep going... + V = V.getOperand(0); + } return SDValue(); } -static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); +static SDValue PerformBFICombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::AND) { // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff // the bits being cleared by the AND are not demanded by the BFI. @@ -14754,20 +12688,24 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); unsigned LSB = countTrailingZeros(~InvMask); - unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB; + unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; assert(Width < static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && "undefined behavior"); unsigned Mask = (1u << Width) - 1; unsigned Mask2 = N11C->getZExtValue(); if ((Mask & (~Mask2)) == 0) - return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), - N->getOperand(0), N1.getOperand(0), N->getOperand(2)); - return SDValue(); - } + return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), + N->getOperand(0), N1.getOperand(0), + N->getOperand(2)); + } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { + // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. + // Keep track of any consecutive bits set that all come from the same base + // value. We can combine these together into a single BFI. + SDValue CombineBFI = FindBFIToCombineWith(N); + if (CombineBFI == SDValue()) + return SDValue(); - // Look for another BFI to combine with. - if (SDValue CombineBFI = FindBFIToCombineWith(N)) { // We've found a BFI. APInt ToMask1, FromMask1; SDValue From1 = ParseBFI(N, ToMask1, FromMask1); @@ -14777,7 +12715,9 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { assert(From1 == From2); (void)From2; - // Create a new BFI, combining the two together. + // First, unlink CombineBFI. + DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); + // Then create a new BFI, combining the two together. APInt NewFromMask = FromMask1 | FromMask2; APInt NewToMask = ToMask1 | ToMask2; @@ -14785,101 +12725,11 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); if (NewFromMask[0] == 0) - From1 = DAG.getNode( - ISD::SRL, dl, VT, From1, - DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); - return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1, - DAG.getConstant(~NewToMask, dl, VT)); - } - - // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so - // that lower bit insertions are performed first, providing that M1 and M2 - // do no overlap. This can allow multiple BFI instructions to be combined - // together by the other folds above. 
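// Editor's illustration (not part of the patch): the BFI-chain walk above only
// merges two bitfield inserts when their destination masks do not overlap and
// sit immediately next to each other. A minimal sketch of that contiguity test
// using plain 32-bit masks instead of APInt; the function name is hypothetical
// and the bit scans use GCC/Clang builtins.
#include <cstdint>

static bool masksConcatenate(uint32_t A, uint32_t B) {
  if (A == 0 || B == 0 || (A & B) != 0)
    return false;                              // empty or overlapping writes
  unsigned LowestInA = __builtin_ctz(A);       // lowest bit written by A
  unsigned HighestInB = 31 - __builtin_clz(B); // highest bit written by B
  return LowestInA == HighestInB + 1;          // A starts right above B
}
// e.g. masksConcatenate(0x00f0, 0x000f) is true; masksConcatenate(0x0f00, 0x000f) is false.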
-  if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
-    APInt ToMask1 = ~N->getConstantOperandAPInt(2);
-    APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
-
-    if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
-        ToMask1.countLeadingZeros() < ToMask2.countLeadingZeros())
-      return SDValue();
-
-    EVT VT = N->getValueType(0);
-    SDLoc dl(N);
-    SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
-                               N->getOperand(1), N->getOperand(2));
-    return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
-                       N0.getOperand(2));
-  }
-
-  return SDValue();
-}
-
-// Check that N is CMPZ(CSINC(0, 0, CC, X)),
-// or CMPZ(CMOV(1, 0, CC, $cpsr, X))
-// return X if valid.
-static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
-  if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
-    return SDValue();
-  SDValue CSInc = Cmp->getOperand(0);
-
-  // Ignore any `And 1` nodes that may not yet have been removed. We are
-  // looking for a value that produces 1/0, so these have no effect on the
-  // code.
-  while (CSInc.getOpcode() == ISD::AND &&
-         isa<ConstantSDNode>(CSInc.getOperand(1)) &&
-         CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
-    CSInc = CSInc.getOperand(0);
-
-  if (CSInc.getOpcode() == ARMISD::CSINC &&
-      isNullConstant(CSInc.getOperand(0)) &&
-      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
-    CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
-    return CSInc.getOperand(3);
-  }
-  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
-      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
-    CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
-    return CSInc.getOperand(4);
-  }
-  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
-      isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
-    CC = ARMCC::getOppositeCondition(
-        (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
-    return CSInc.getOperand(4);
-  }
-  return SDValue();
-}
-
-static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
-  // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly.
As in - // t92: glue = ARMISD::CMPZ t74, 0 - // t93: i32 = ARMISD::CSINC 0, 0, 1, t92 - // t96: glue = ARMISD::CMPZ t93, 0 - // t114: i32 = ARMISD::CSINV 0, 0, 0, t96 - ARMCC::CondCodes Cond; - if (SDValue C = IsCMPZCSINC(N, Cond)) - if (Cond == ARMCC::EQ) - return C; - return SDValue(); -} - -static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) { - // Fold away an unneccessary CMPZ/CSINC - // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) -> - // if C1==EQ -> CSXYZ A, B, C2, D - // if C1==NE -> CSXYZ A, B, NOT(C2), D - ARMCC::CondCodes Cond; - if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) { - if (N->getConstantOperandVal(2) == ARMCC::EQ) - return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), - N->getOperand(1), - DAG.getConstant(Cond, SDLoc(N), MVT::i32), C); - if (N->getConstantOperandVal(2) == ARMCC::NE) - return DAG.getNode( - N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), - N->getOperand(1), - DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C); + From1 = DCI.DAG.getNode( + ISD::SRL, dl, VT, From1, + DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); + return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, + DCI.DAG.getConstant(~NewToMask, dl, VT)); } return SDValue(); } @@ -14908,14 +12758,14 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, SDValue BasePtr = LD->getBasePtr(); SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), - LD->getAlign(), LD->getMemOperand()->getFlags()); + LD->getAlignment(), LD->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, LD->getPointerInfo().getWithOffset(4), - commonAlignment(LD->getAlign(), 4), + std::min(4U, LD->getAlignment()), LD->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); @@ -14925,54 +12775,6 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, return Result; } - // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d - // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b - if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(InDouble.getOperand(1))) { - SDValue BV = InDouble.getOperand(0); - // Look up through any nop bitcasts and vector_reg_casts. bitcasts may - // change lane order under big endian. - bool BVSwap = BV.getOpcode() == ISD::BITCAST; - while ( - (BV.getOpcode() == ISD::BITCAST || - BV.getOpcode() == ARMISD::VECTOR_REG_CAST) && - (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) { - BVSwap = BV.getOpcode() == ISD::BITCAST; - BV = BV.getOperand(0); - } - if (BV.getValueType() != MVT::v4i32) - return SDValue(); - - // Handle buildvectors, pulling out the correct lane depending on - // endianness. - unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0; - if (BV.getOpcode() == ISD::BUILD_VECTOR) { - SDValue Op0 = BV.getOperand(Offset); - SDValue Op1 = BV.getOperand(Offset + 1); - if (!Subtarget->isLittle() && BVSwap) - std::swap(Op0, Op1); - - return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); - } - - // A chain of insert_vectors, grabbing the correct value of the chain of - // inserts. 
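// Editor's illustration (not part of the patch): the identity behind the
// CMPZ(CSINC)/CSET folds above. Materialising a condition into a 0/1 value and
// then comparing that value against zero only re-tests the condition (or its
// inverse), so the intermediate 0/1 value can be dropped. A self-contained
// sketch at the boolean level; names are hypothetical.
#include <cassert>

static int materialize(bool Cond) { return Cond ? 1 : 0; }

int main() {
  const bool Vals[] = {false, true};
  for (bool C : Vals) {
    assert((materialize(C) != 0) == C);  // NE against zero == original condition
    assert((materialize(C) == 0) == !C); // EQ against zero == opposite condition
  }
  return 0;
}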
- SDValue Op0, Op1; - while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) { - if (isa<ConstantSDNode>(BV.getOperand(2))) { - if (BV.getConstantOperandVal(2) == Offset) - Op0 = BV.getOperand(1); - if (BV.getConstantOperandVal(2) == Offset + 1) - Op1 = BV.getOperand(1); - } - BV = BV.getOperand(0); - } - if (!Subtarget->isLittle() && BVSwap) - std::swap(Op0, Op1); - if (Op0 && Op1) - return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); - } - return SDValue(); } @@ -14994,84 +12796,6 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue PerformVMOVhrCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SDValue Op0 = N->getOperand(0); - - // VMOVhr (VMOVrh (X)) -> X - if (Op0->getOpcode() == ARMISD::VMOVrh) - return Op0->getOperand(0); - - // FullFP16: half values are passed in S-registers, and we don't - // need any of the bitcast and moves: - // - // t2: f32,ch = CopyFromReg t0, Register:f32 %0 - // t5: i32 = bitcast t2 - // t18: f16 = ARMISD::VMOVhr t5 - if (Op0->getOpcode() == ISD::BITCAST) { - SDValue Copy = Op0->getOperand(0); - if (Copy.getValueType() == MVT::f32 && - Copy->getOpcode() == ISD::CopyFromReg) { - SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)}; - SDValue NewCopy = - DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops); - return NewCopy; - } - } - - // fold (VMOVhr (load x)) -> (load (f16*)x) - if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) { - if (LN0->hasOneUse() && LN0->isUnindexed() && - LN0->getMemoryVT() == MVT::i16) { - SDValue Load = - DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(), - LN0->getBasePtr(), LN0->getMemOperand()); - DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); - DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1)); - return Load; - } - } - - // Only the bottom 16 bits of the source register are used. - APInt DemandedMask = APInt::getLowBitsSet(32, 16); - const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI)) - return SDValue(N, 0); - - return SDValue(); -} - -static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - EVT VT = N->getValueType(0); - - // fold (VMOVrh (fpconst x)) -> const x - if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) { - APFloat V = C->getValueAPF(); - return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT); - } - - // fold (VMOVrh (load x)) -> (zextload (i16*)x) - if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) { - LoadSDNode *LN0 = cast<LoadSDNode>(N0); - - SDValue Load = - DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), - LN0->getBasePtr(), MVT::i16, LN0->getMemOperand()); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); - return Load; - } - - // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n) - if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(N0->getOperand(1))) - return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0), - N0->getOperand(1)); - - return SDValue(); -} - /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node /// are normal, non-volatile loads. 
If so, it is profitable to bitcast an /// i64 vector to have f64 elements, since the value can then be loaded @@ -15222,55 +12946,15 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // If the valuetypes are the same, we can remove the cast entirely. if (Op->getOperand(0).getValueType() == VT) return Op->getOperand(0); - return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); + return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, + Op->getOperand(0).getValueType(), Op->getOperand(0)); } - // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce - // more VPNOT which might get folded as else predicates. - if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) { - SDValue X = - DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); - SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, - DCI.DAG.getConstant(65535, dl, MVT::i32)); - return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C); - } - - // Only the bottom 16 bits of the source register are used. - if (Op.getValueType() == MVT::i32) { - APInt DemandedMask = APInt::getLowBitsSet(32, 16); - const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) - return SDValue(N, 0); - } return SDValue(); } -static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); - SDValue Op = N->getOperand(0); - SDLoc dl(N); - - // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST - if (ST->isLittle()) - return DAG.getNode(ISD::BITCAST, dl, VT, Op); - - // VECTOR_REG_CAST undef -> undef - if (Op.isUndef()) - return DAG.getUNDEF(VT); - - // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x) - if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) { - // If the valuetypes are the same, we can remove the cast entirely. - if (Op->getOperand(0).getValueType() == VT) - return Op->getOperand(0); - return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0)); - } - - return SDValue(); -} - -static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, +static SDValue PerformVCMPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { if (!Subtarget->hasMVEIntegerOps()) return SDValue(); @@ -15284,18 +12968,19 @@ static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, // vcmp X, 0, cc -> vcmpz X, cc if (isZeroVector(Op1)) - return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2)); + return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, + N->getOperand(2)); unsigned SwappedCond = getSwappedCondition(Cond); if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) { // vcmp 0, X, cc -> vcmpz X, reversed(cc) if (isZeroVector(Op0)) - return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, - DAG.getConstant(SwappedCond, dl, MVT::i32)); + return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, + DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc) if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP) - return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, - DAG.getConstant(SwappedCond, dl, MVT::i32)); + return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, + DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); } return SDValue(); @@ -15327,265 +13012,9 @@ static SDValue PerformInsertEltCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); } -// Convert a pair of extracts from the same base vector to a VMOVRRD. 
Either -// directly or bitcast to an integer if the original is a float vector. -// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2) -// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2) -static SDValue -PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - EVT VT = N->getValueType(0); - SDLoc dl(N); - - if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 || - !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64)) - return SDValue(); - - SDValue Ext = SDValue(N, 0); - if (Ext.getOpcode() == ISD::BITCAST && - Ext.getOperand(0).getValueType() == MVT::f32) - Ext = Ext.getOperand(0); - if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - !isa<ConstantSDNode>(Ext.getOperand(1)) || - Ext.getConstantOperandVal(1) % 2 != 0) - return SDValue(); - if (Ext->use_size() == 1 && - (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP || - Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP)) - return SDValue(); - - SDValue Op0 = Ext.getOperand(0); - EVT VecVT = Op0.getValueType(); - unsigned ResNo = Op0.getResNo(); - unsigned Lane = Ext.getConstantOperandVal(1); - if (VecVT.getVectorNumElements() != 4) - return SDValue(); - - // Find another extract, of Lane + 1 - auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) { - return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(V->getOperand(1)) && - V->getConstantOperandVal(1) == Lane + 1 && - V->getOperand(0).getResNo() == ResNo; - }); - if (OtherIt == Op0->uses().end()) - return SDValue(); - - // For float extracts, we need to be converting to a i32 for both vector - // lanes. - SDValue OtherExt(*OtherIt, 0); - if (OtherExt.getValueType() != MVT::i32) { - if (OtherExt->use_size() != 1 || - OtherExt->use_begin()->getOpcode() != ISD::BITCAST || - OtherExt->use_begin()->getValueType(0) != MVT::i32) - return SDValue(); - OtherExt = SDValue(*OtherExt->use_begin(), 0); - } - - // Convert the type to a f64 and extract with a VMOVRRD. 
- SDValue F64 = DCI.DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0), - DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32)); - SDValue VMOVRRD = - DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64); - - DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1)); - return VMOVRRD; -} - -static SDValue PerformExtractEltCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *ST) { - SDValue Op0 = N->getOperand(0); - EVT VT = N->getValueType(0); - SDLoc dl(N); - - // extract (vdup x) -> x - if (Op0->getOpcode() == ARMISD::VDUP) { - SDValue X = Op0->getOperand(0); - if (VT == MVT::f16 && X.getValueType() == MVT::i32) - return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X); - if (VT == MVT::i32 && X.getValueType() == MVT::f16) - return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X); - if (VT == MVT::f32 && X.getValueType() == MVT::i32) - return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X); - - while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST) - X = X->getOperand(0); - if (X.getValueType() == VT) - return X; - } - - // extract ARM_BUILD_VECTOR -> x - if (Op0->getOpcode() == ARMISD::BUILD_VECTOR && - isa<ConstantSDNode>(N->getOperand(1)) && - N->getConstantOperandVal(1) < Op0.getNumOperands()) { - return Op0.getOperand(N->getConstantOperandVal(1)); - } - - // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b - if (Op0.getValueType() == MVT::v4i32 && - isa<ConstantSDNode>(N->getOperand(1)) && - Op0.getOpcode() == ISD::BITCAST && - Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && - Op0.getOperand(0).getValueType() == MVT::v2f64) { - SDValue BV = Op0.getOperand(0); - unsigned Offset = N->getConstantOperandVal(1); - SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1); - if (MOV.getOpcode() == ARMISD::VMOVDRR) - return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2); - } - - // extract x, n; extract x, n+1 -> VMOVRRD x - if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) - return R; - - // extract (MVETrunc(x)) -> extract x - if (Op0->getOpcode() == ARMISD::MVETRUNC) { - unsigned Idx = N->getConstantOperandVal(1); - unsigned Vec = - Idx / Op0->getOperand(0).getValueType().getVectorNumElements(); - unsigned SubIdx = - Idx % Op0->getOperand(0).getValueType().getVectorNumElements(); - return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec), - DCI.DAG.getConstant(SubIdx, dl, MVT::i32)); - } - - return SDValue(); -} - -static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) { - SDValue Op = N->getOperand(0); - EVT VT = N->getValueType(0); - - // sext_inreg(VGETLANEu) -> VGETLANEs - if (Op.getOpcode() == ARMISD::VGETLANEu && - cast<VTSDNode>(N->getOperand(1))->getVT() == - Op.getOperand(0).getValueType().getScalarType()) - return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0), - Op.getOperand(1)); - - return SDValue(); -} - -// When lowering complex nodes that we recognize, like VQDMULH and MULH, we -// can end up with shuffle(binop(shuffle, shuffle)), that can be simplified to -// binop as the shuffles cancel out. -static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT) - return SDValue(); - SDValue Op = N->getOperand(0); - - // Looking for binary operators that will have been folded from - // truncates/extends. 
- switch (Op.getOpcode()) { - case ARMISD::VQDMULH: - case ISD::MULHS: - case ISD::MULHU: - case ISD::ABDS: - case ISD::ABDU: - case ISD::AVGFLOORS: - case ISD::AVGFLOORU: - case ISD::AVGCEILS: - case ISD::AVGCEILU: - break; - default: - return SDValue(); - } - - ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0)); - ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1)); - if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() || - !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() || - Op0->getOperand(0).getValueType() != VT) - return SDValue(); - - // Check the mask turns into an identity shuffle. - ArrayRef<int> NMask = N->getMask(); - ArrayRef<int> OpMask = Op0->getMask(); - for (int i = 0, e = NMask.size(); i != e; i++) { - if (NMask[i] > 0 && OpMask[NMask[i]] > 0 && OpMask[NMask[i]] != i) - return SDValue(); - } - - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - Op0->getOperand(0), Op1->getOperand(0)); -} - -static SDValue -PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - SDValue Vec = N->getOperand(0); - SDValue SubVec = N->getOperand(1); - uint64_t IdxVal = N->getConstantOperandVal(2); - EVT VecVT = Vec.getValueType(); - EVT SubVT = SubVec.getValueType(); - - // Only do this for legal fixed vector types. - if (!VecVT.isFixedLengthVector() || - !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) || - !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT)) - return SDValue(); - - // Ignore widening patterns. - if (IdxVal == 0 && Vec.isUndef()) - return SDValue(); - - // Subvector must be half the width and an "aligned" insertion. - unsigned NumSubElts = SubVT.getVectorNumElements(); - if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() || - (IdxVal != 0 && IdxVal != NumSubElts)) - return SDValue(); - - // Fold insert_subvector -> concat_vectors - // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi)) - // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub) - SDLoc DL(N); - SDValue Lo, Hi; - if (IdxVal == 0) { - Lo = SubVec; - Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, - DCI.DAG.getVectorIdxConstant(NumSubElts, DL)); - } else { - Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec, - DCI.DAG.getVectorIdxConstant(0, DL)); - Hi = SubVec; - } - return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi); -} - -// shuffle(MVETrunc(x, y)) -> VMOVN(x, y) -static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, - SelectionDAG &DAG) { - SDValue Trunc = N->getOperand(0); - EVT VT = Trunc.getValueType(); - if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef()) - return SDValue(); - - SDLoc DL(Trunc); - if (isVMOVNTruncMask(N->getMask(), VT, false)) - return DAG.getNode( - ARMISD::VMOVN, DL, VT, - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)), - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), - DAG.getConstant(1, DL, MVT::i32)); - else if (isVMOVNTruncMask(N->getMask(), VT, true)) - return DAG.getNode( - ARMISD::VMOVN, DL, VT, - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)), - DAG.getConstant(1, DL, MVT::i32)); - return SDValue(); -} - /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for /// ISD::VECTOR_SHUFFLE. 
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { - if (SDValue R = FlattenVectorShuffle(cast<ShuffleVectorSDNode>(N), DAG)) - return R; - if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG)) - return R; - // The LLVM shufflevector instruction does not require the shuffle mask // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the @@ -15635,388 +13064,6 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { DAG.getUNDEF(VT), NewMask); } -/// Load/store instruction that can be merged with a base address -/// update -struct BaseUpdateTarget { - SDNode *N; - bool isIntrinsic; - bool isStore; - unsigned AddrOpIdx; -}; - -struct BaseUpdateUser { - /// Instruction that updates a pointer - SDNode *N; - /// Pointer increment operand - SDValue Inc; - /// Pointer increment value if it is a constant, or 0 otherwise - unsigned ConstInc; -}; - -static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, - struct BaseUpdateUser &User, - bool SimpleConstIncOnly, - TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - SDNode *N = Target.N; - MemSDNode *MemN = cast<MemSDNode>(N); - SDLoc dl(N); - - // Find the new opcode for the updating load/store. - bool isLoadOp = true; - bool isLaneOp = false; - // Workaround for vst1x and vld1x intrinsics which do not have alignment - // as an operand. - bool hasAlignment = true; - unsigned NewOpc = 0; - unsigned NumVecs = 0; - if (Target.isIntrinsic) { - unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: - llvm_unreachable("unexpected intrinsic for Neon base update"); - case Intrinsic::arm_neon_vld1: - NewOpc = ARMISD::VLD1_UPD; - NumVecs = 1; - break; - case Intrinsic::arm_neon_vld2: - NewOpc = ARMISD::VLD2_UPD; - NumVecs = 2; - break; - case Intrinsic::arm_neon_vld3: - NewOpc = ARMISD::VLD3_UPD; - NumVecs = 3; - break; - case Intrinsic::arm_neon_vld4: - NewOpc = ARMISD::VLD4_UPD; - NumVecs = 4; - break; - case Intrinsic::arm_neon_vld1x2: - NewOpc = ARMISD::VLD1x2_UPD; - NumVecs = 2; - hasAlignment = false; - break; - case Intrinsic::arm_neon_vld1x3: - NewOpc = ARMISD::VLD1x3_UPD; - NumVecs = 3; - hasAlignment = false; - break; - case Intrinsic::arm_neon_vld1x4: - NewOpc = ARMISD::VLD1x4_UPD; - NumVecs = 4; - hasAlignment = false; - break; - case Intrinsic::arm_neon_vld2dup: - NewOpc = ARMISD::VLD2DUP_UPD; - NumVecs = 2; - break; - case Intrinsic::arm_neon_vld3dup: - NewOpc = ARMISD::VLD3DUP_UPD; - NumVecs = 3; - break; - case Intrinsic::arm_neon_vld4dup: - NewOpc = ARMISD::VLD4DUP_UPD; - NumVecs = 4; - break; - case Intrinsic::arm_neon_vld2lane: - NewOpc = ARMISD::VLD2LN_UPD; - NumVecs = 2; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vld3lane: - NewOpc = ARMISD::VLD3LN_UPD; - NumVecs = 3; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vld4lane: - NewOpc = ARMISD::VLD4LN_UPD; - NumVecs = 4; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vst1: - NewOpc = ARMISD::VST1_UPD; - NumVecs = 1; - isLoadOp = false; - break; - case Intrinsic::arm_neon_vst2: - NewOpc = ARMISD::VST2_UPD; - NumVecs = 2; - isLoadOp = false; - break; - case Intrinsic::arm_neon_vst3: - NewOpc = ARMISD::VST3_UPD; - NumVecs = 3; - isLoadOp = false; - break; - case Intrinsic::arm_neon_vst4: - NewOpc = ARMISD::VST4_UPD; - NumVecs = 4; - isLoadOp = false; - break; - case Intrinsic::arm_neon_vst2lane: - NewOpc = ARMISD::VST2LN_UPD; - 
NumVecs = 2; - isLoadOp = false; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vst3lane: - NewOpc = ARMISD::VST3LN_UPD; - NumVecs = 3; - isLoadOp = false; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vst4lane: - NewOpc = ARMISD::VST4LN_UPD; - NumVecs = 4; - isLoadOp = false; - isLaneOp = true; - break; - case Intrinsic::arm_neon_vst1x2: - NewOpc = ARMISD::VST1x2_UPD; - NumVecs = 2; - isLoadOp = false; - hasAlignment = false; - break; - case Intrinsic::arm_neon_vst1x3: - NewOpc = ARMISD::VST1x3_UPD; - NumVecs = 3; - isLoadOp = false; - hasAlignment = false; - break; - case Intrinsic::arm_neon_vst1x4: - NewOpc = ARMISD::VST1x4_UPD; - NumVecs = 4; - isLoadOp = false; - hasAlignment = false; - break; - } - } else { - isLaneOp = true; - switch (N->getOpcode()) { - default: - llvm_unreachable("unexpected opcode for Neon base update"); - case ARMISD::VLD1DUP: - NewOpc = ARMISD::VLD1DUP_UPD; - NumVecs = 1; - break; - case ARMISD::VLD2DUP: - NewOpc = ARMISD::VLD2DUP_UPD; - NumVecs = 2; - break; - case ARMISD::VLD3DUP: - NewOpc = ARMISD::VLD3DUP_UPD; - NumVecs = 3; - break; - case ARMISD::VLD4DUP: - NewOpc = ARMISD::VLD4DUP_UPD; - NumVecs = 4; - break; - case ISD::LOAD: - NewOpc = ARMISD::VLD1_UPD; - NumVecs = 1; - isLaneOp = false; - break; - case ISD::STORE: - NewOpc = ARMISD::VST1_UPD; - NumVecs = 1; - isLaneOp = false; - isLoadOp = false; - break; - } - } - - // Find the size of memory referenced by the load/store. - EVT VecTy; - if (isLoadOp) { - VecTy = N->getValueType(0); - } else if (Target.isIntrinsic) { - VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType(); - } else { - assert(Target.isStore && - "Node has to be a load, a store, or an intrinsic!"); - VecTy = N->getOperand(1).getValueType(); - } - - bool isVLDDUPOp = - NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD || - NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD; - - unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; - if (isLaneOp || isVLDDUPOp) - NumBytes /= VecTy.getVectorNumElements(); - - if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) { - // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two - // separate instructions that make it harder to use a non-constant update. - return false; - } - - if (SimpleConstIncOnly && User.ConstInc != NumBytes) - return false; - - // OK, we found an ADD we can fold into the base update. - // Now, create a _UPD node, taking care of not breaking alignment. - - EVT AlignedVecTy = VecTy; - Align Alignment = MemN->getAlign(); - - // If this is a less-than-standard-aligned load/store, change the type to - // match the standard alignment. - // The alignment is overlooked when selecting _UPD variants; and it's - // easier to introduce bitcasts here than fix that. - // There are 3 ways to get to this base-update combine: - // - intrinsics: they are assumed to be properly aligned (to the standard - // alignment of the memory type), so we don't need to do anything. - // - ARMISD::VLDx nodes: they are only generated from the aforementioned - // intrinsics, so, likewise, there's nothing to do. - // - generic load/store instructions: the alignment is specified as an - // explicit operand, rather than implicitly as the standard alignment - // of the memory type (like the intrisics). We need to change the - // memory type to match the explicit alignment. That way, we don't - // generate non-standard-aligned ARMISD::VLDx nodes. 
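// Editor's illustration (not part of the patch): how the base-update combine
// re-types an under-aligned generic load/store so the element width matches
// the known alignment before forming a VLD1/VST1_UPD node. A sketch with
// hypothetical names; e.g. a byte-aligned v4i16 access is treated as v8i8.
#include <cassert>

struct SimpleVT { unsigned EltBits, NumElts; };

static SimpleVT retypeForAlignment(SimpleVT VT, unsigned AlignBytes) {
  unsigned EltBytes = VT.EltBits / 8;
  if (AlignBytes >= EltBytes)
    return VT;                                   // already standard-aligned
  unsigned TotalBytes = EltBytes * VT.NumElts;
  return {AlignBytes * 8, TotalBytes / AlignBytes};
}

int main() {
  SimpleVT V = retypeForAlignment({16, 4}, 1);   // v4i16 with 1-byte alignment
  assert(V.EltBits == 8 && V.NumElts == 8);      // handled as v8i8
  return 0;
}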
- if (isa<LSBaseSDNode>(N)) { - if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) { - MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8); - assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); - assert(!isLaneOp && "Unexpected generic load/store lane."); - unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); - AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); - } - // Don't set an explicit alignment on regular load/stores that we want - // to transform to VLD/VST 1_UPD nodes. - // This matches the behavior of regular load/stores, which only get an - // explicit alignment if the MMO alignment is larger than the standard - // alignment of the memory type. - // Intrinsics, however, always get an explicit alignment, set to the - // alignment of the MMO. - Alignment = Align(1); - } - - // Create the new updating load/store node. - // First, create an SDVTList for the new updating node's results. - EVT Tys[6]; - unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); - unsigned n; - for (n = 0; n < NumResultVecs; ++n) - Tys[n] = AlignedVecTy; - Tys[n++] = MVT::i32; - Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); - - // Then, gather the new node's operands. - SmallVector<SDValue, 8> Ops; - Ops.push_back(N->getOperand(0)); // incoming chain - Ops.push_back(N->getOperand(Target.AddrOpIdx)); - Ops.push_back(User.Inc); - - if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { - // Try to match the intrinsic's signature - Ops.push_back(StN->getValue()); - } else { - // Loads (and of course intrinsics) match the intrinsics' signature, - // so just add all but the alignment operand. - unsigned LastOperand = - hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands(); - for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i) - Ops.push_back(N->getOperand(i)); - } - - // For all node types, the alignment operand is always the last one. - Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32)); - - // If this is a non-standard-aligned STORE, the penultimate operand is the - // stored value. Bitcast it to the aligned type. - if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { - SDValue &StVal = Ops[Ops.size() - 2]; - StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); - } - - EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, - MemN->getMemOperand()); - - // Update the uses. - SmallVector<SDValue, 5> NewResults; - for (unsigned i = 0; i < NumResultVecs; ++i) - NewResults.push_back(SDValue(UpdN.getNode(), i)); - - // If this is an non-standard-aligned LOAD, the first result is the loaded - // value. Bitcast it to the expected result type. - if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { - SDValue &LdVal = NewResults[0]; - LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); - } - - NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain - DCI.CombineTo(N, NewResults); - DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs)); - - return true; -} - -// If (opcode ptr inc) is and ADD-like instruction, return the -// increment value. Otherwise return 0. 
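// Editor's illustration (not part of the patch): why the helper below also
// accepts OR as an "ADD-like" pointer increment. When the pointer and the
// increment share no set bits, OR and ADD produce the same value (no carries
// can occur), which is common after alignment-based address arithmetic.
// Self-contained sketch with hypothetical values.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t AlignedPtr = 0xffffff00u;  // 256-byte aligned base address
  uint32_t Inc = 0x10u;               // small offset with disjoint bits
  assert((AlignedPtr & Inc) == 0);
  assert((AlignedPtr | Inc) == AlignedPtr + Inc);
  return 0;
}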
-static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, - SDValue Inc, const SelectionDAG &DAG) { - ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); - if (!CInc) - return 0; - - switch (Opcode) { - case ARMISD::VLD1_UPD: - case ISD::ADD: - return CInc->getZExtValue(); - case ISD::OR: { - if (DAG.haveNoCommonBitsSet(Ptr, Inc)) { - // (OR ptr inc) is the same as (ADD ptr inc) - return CInc->getZExtValue(); - } - return 0; - } - default: - return 0; - } -} - -static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) { - switch (N->getOpcode()) { - case ISD::ADD: - case ISD::OR: { - if (isa<ConstantSDNode>(N->getOperand(1))) { - *Ptr = N->getOperand(0); - *CInc = N->getOperand(1); - return true; - } - return false; - } - case ARMISD::VLD1_UPD: { - if (isa<ConstantSDNode>(N->getOperand(2))) { - *Ptr = N->getOperand(1); - *CInc = N->getOperand(2); - return true; - } - return false; - } - default: - return false; - } -} - -static bool isValidBaseUpdate(SDNode *N, SDNode *User) { - // Check that the add is independent of the load/store. - // Otherwise, folding it would create a cycle. Search through Addr - // as well, since the User may not be a direct user of Addr and - // only share a base pointer. - SmallPtrSet<const SDNode *, 32> Visited; - SmallVector<const SDNode *, 16> Worklist; - Worklist.push_back(N); - Worklist.push_back(User); - if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || - SDNode::hasPredecessorHelper(User, Visited, Worklist)) - return false; - return true; -} - /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, /// NEON load/store intrinsics, and generic vector load/stores, to merge /// base address updates. @@ -16024,125 +13071,18 @@ static bool isValidBaseUpdate(SDNode *N, SDNode *User) { /// The caller is assumed to have checked legality. static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || N->getOpcode() == ISD::INTRINSIC_W_CHAIN); const bool isStore = N->getOpcode() == ISD::STORE; const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); - BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx}; - SDValue Addr = N->getOperand(AddrOpIdx); - - SmallVector<BaseUpdateUser, 8> BaseUpdates; - - // Search for a use of the address operand that is an increment. - for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), - UE = Addr.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (UI.getUse().getResNo() != Addr.getResNo() || - User->getNumOperands() != 2) - continue; - - SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 0 : 1); - unsigned ConstInc = - getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG); - - if (ConstInc || User->getOpcode() == ISD::ADD) - BaseUpdates.push_back({User, Inc, ConstInc}); - } - - // If the address is a constant pointer increment itself, find - // another constant increment that has the same base operand - SDValue Base; - SDValue CInc; - if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) { - unsigned Offset = - getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG); - for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end(); - UI != UE; ++UI) { - - SDNode *User = *UI; - if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() || - User->getNumOperands() != 2) - continue; - - SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 
1 : 0); - unsigned UserOffset = - getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG); - - if (!UserOffset || UserOffset <= Offset) - continue; - - unsigned NewConstInc = UserOffset - Offset; - SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32); - BaseUpdates.push_back({User, NewInc, NewConstInc}); - } - } - - // Try to fold the load/store with an update that matches memory - // access size. This should work well for sequential loads. - // - // Filter out invalid updates as well. - unsigned NumValidUpd = BaseUpdates.size(); - for (unsigned I = 0; I < NumValidUpd;) { - BaseUpdateUser &User = BaseUpdates[I]; - if (!isValidBaseUpdate(N, User.N)) { - --NumValidUpd; - std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]); - continue; - } - - if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI)) - return SDValue(); - ++I; - } - BaseUpdates.resize(NumValidUpd); - - // Try to fold with other users. Non-constant updates are considered - // first, and constant updates are sorted to not break a sequence of - // strided accesses (if there is any). - std::stable_sort(BaseUpdates.begin(), BaseUpdates.end(), - [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) { - return LHS.ConstInc < RHS.ConstInc; - }); - for (BaseUpdateUser &User : BaseUpdates) { - if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI)) - return SDValue(); - } - return SDValue(); -} - -static SDValue PerformVLDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); - - return CombineBaseUpdate(N, DCI); -} - -static SDValue PerformMVEVLDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDValue Addr = N->getOperand(2); MemSDNode *MemN = cast<MemSDNode>(N); SDLoc dl(N); - // For the stores, where there are multiple intrinsics we only actually want - // to post-inc the last of the them. - unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); - if (IntNo == Intrinsic::arm_mve_vst2q && - cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1) - return SDValue(); - if (IntNo == Intrinsic::arm_mve_vst4q && - cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3) - return SDValue(); - // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), - UE = Addr.getNode()->use_end(); - UI != UE; ++UI) { + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) @@ -16162,46 +13102,126 @@ static SDValue PerformMVEVLDCombine(SDNode *N, // Find the new opcode for the updating load/store. 
bool isLoadOp = true; + bool isLaneOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; - switch (IntNo) { - default: - llvm_unreachable("unexpected intrinsic for MVE VLDn combine"); - case Intrinsic::arm_mve_vld2q: - NewOpc = ARMISD::VLD2_UPD; - NumVecs = 2; - break; - case Intrinsic::arm_mve_vld4q: - NewOpc = ARMISD::VLD4_UPD; - NumVecs = 4; - break; - case Intrinsic::arm_mve_vst2q: - NewOpc = ARMISD::VST2_UPD; - NumVecs = 2; - isLoadOp = false; - break; - case Intrinsic::arm_mve_vst4q: - NewOpc = ARMISD::VST4_UPD; - NumVecs = 4; - isLoadOp = false; - break; + if (isIntrinsic) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: llvm_unreachable("unexpected intrinsic for Neon base update"); + case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; break; + case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; + NumVecs = 2; break; + case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; + NumVecs = 3; break; + case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; + NumVecs = 4; break; + case Intrinsic::arm_neon_vld2dup: + case Intrinsic::arm_neon_vld3dup: + case Intrinsic::arm_neon_vld4dup: + // TODO: Support updating VLDxDUP nodes. For now, we just skip + // combining base updates for such intrinsics. + continue; + case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; + NumVecs = 2; isLaneOp = true; break; + case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; + NumVecs = 3; isLaneOp = true; break; + case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; + NumVecs = 4; isLaneOp = true; break; + case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLoadOp = false; break; + case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; + NumVecs = 2; isLoadOp = false; break; + case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; + NumVecs = 3; isLoadOp = false; break; + case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; + NumVecs = 4; isLoadOp = false; break; + case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; + NumVecs = 2; isLoadOp = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; + NumVecs = 3; isLoadOp = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; + NumVecs = 4; isLoadOp = false; isLaneOp = true; break; + } + } else { + isLaneOp = true; + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected opcode for Neon base update"); + case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; + case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; + case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; + case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; + case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; isLaneOp = false; break; + case ISD::STORE: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLaneOp = false; isLoadOp = false; break; + } } // Find the size of memory referenced by the load/store. 
EVT VecTy; if (isLoadOp) { VecTy = N->getValueType(0); + } else if (isIntrinsic) { + VecTy = N->getOperand(AddrOpIdx+1).getValueType(); } else { - VecTy = N->getOperand(3).getValueType(); + assert(isStore && "Node has to be a load, a store, or an intrinsic!"); + VecTy = N->getOperand(1).getValueType(); } unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (isLaneOp) + NumBytes /= VecTy.getVectorNumElements(); // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); - if (!CInc || CInc->getZExtValue() != NumBytes) + if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { + // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two + // separate instructions that make it harder to use a non-constant update. continue; + } + + // OK, we found an ADD we can fold into the base update. + // Now, create a _UPD node, taking care of not breaking alignment. + + EVT AlignedVecTy = VecTy; + unsigned Alignment = MemN->getAlignment(); + + // If this is a less-than-standard-aligned load/store, change the type to + // match the standard alignment. + // The alignment is overlooked when selecting _UPD variants; and it's + // easier to introduce bitcasts here than fix that. + // There are 3 ways to get to this base-update combine: + // - intrinsics: they are assumed to be properly aligned (to the standard + // alignment of the memory type), so we don't need to do anything. + // - ARMISD::VLDx nodes: they are only generated from the aforementioned + // intrinsics, so, likewise, there's nothing to do. + // - generic load/store instructions: the alignment is specified as an + // explicit operand, rather than implicitly as the standard alignment + // of the memory type (like the intrisics). We need to change the + // memory type to match the explicit alignment. That way, we don't + // generate non-standard-aligned ARMISD::VLDx nodes. + if (isa<LSBaseSDNode>(N)) { + if (Alignment == 0) + Alignment = 1; + if (Alignment < VecTy.getScalarSizeInBits() / 8) { + MVT EltTy = MVT::getIntegerVT(Alignment * 8); + assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); + assert(!isLaneOp && "Unexpected generic load/store lane."); + unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); + AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); + } + // Don't set an explicit alignment on regular load/stores that we want + // to transform to VLD/VST 1_UPD nodes. + // This matches the behavior of regular load/stores, which only get an + // explicit alignment if the MMO alignment is larger than the standard + // alignment of the memory type. + // Intrinsics, however, always get an explicit alignment, set to the + // alignment of the MMO. + Alignment = 1; + } // Create the new updating load/store node. // First, create an SDVTList for the new updating node's results. @@ -16209,21 +13229,39 @@ static SDValue PerformMVEVLDCombine(SDNode *N, unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); unsigned n; for (n = 0; n < NumResultVecs; ++n) - Tys[n] = VecTy; + Tys[n] = AlignedVecTy; Tys[n++] = MVT::i32; Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); + SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); // Then, gather the new node's operands. 
SmallVector<SDValue, 8> Ops; Ops.push_back(N->getOperand(0)); // incoming chain - Ops.push_back(N->getOperand(2)); // ptr + Ops.push_back(N->getOperand(AddrOpIdx)); Ops.push_back(Inc); - for (unsigned i = 3; i < N->getNumOperands(); ++i) - Ops.push_back(N->getOperand(i)); + if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { + // Try to match the intrinsic's signature + Ops.push_back(StN->getValue()); + } else { + // Loads (and of course intrinsics) match the intrinsics' signature, + // so just add all but the alignment operand. + for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) + Ops.push_back(N->getOperand(i)); + } + + // For all node types, the alignment operand is always the last one. + Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); + + // If this is a non-standard-aligned STORE, the penultimate operand is the + // stored value. Bitcast it to the aligned type. + if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { + SDValue &StVal = Ops[Ops.size()-2]; + StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); + } - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy, + EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, MemN->getMemOperand()); // Update the uses. @@ -16231,16 +13269,30 @@ static SDValue PerformMVEVLDCombine(SDNode *N, for (unsigned i = 0; i < NumResultVecs; ++i) NewResults.push_back(SDValue(UpdN.getNode(), i)); - NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain + // If this is an non-standard-aligned LOAD, the first result is the loaded + // value. Bitcast it to the expected result type. + if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { + SDValue &LdVal = NewResults[0]; + LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); + } + + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); break; } - return SDValue(); } +static SDValue PerformVLDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + return CombineBaseUpdate(N, DCI); +} + /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and @@ -16293,7 +13345,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { for (n = 0; n < NumVecs; ++n) Tys[n] = VT; Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1)); + SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, @@ -16325,21 +13377,8 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { /// PerformVDUPLANECombine - Target-specific dag combine xforms for /// ARMISD::VDUPLANE. static SDValue PerformVDUPLANECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { SDValue Op = N->getOperand(0); - EVT VT = N->getValueType(0); - - // On MVE, we just convert the VDUPLANE to a VDUP with an extract. 
- if (Subtarget->hasMVEIntegerOps()) { - EVT ExtractVT = VT.getVectorElementType(); - // We need to ensure we are creating a legal type. - if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT)) - ExtractVT = MVT::i32; - SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT, - N->getOperand(0), N->getOperand(1)); - return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract); - } // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. @@ -16360,6 +13399,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N, unsigned EltBits; if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) EltSize = 8; + EVT VT = N->getValueType(0); if (EltSize > VT.getScalarSizeInBits()) return SDValue(); @@ -16367,21 +13407,11 @@ static SDValue PerformVDUPLANECombine(SDNode *N, } /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. -static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, +static SDValue PerformVDUPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; SDValue Op = N->getOperand(0); - SDLoc dl(N); - - if (Subtarget->hasMVEIntegerOps()) { - // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will - // need to come from a GPR. - if (Op.getValueType() == MVT::f32) - return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), - DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op)); - else if (Op.getValueType() == MVT::f16) - return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), - DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op)); - } if (!Subtarget->hasNEON()) return SDValue(); @@ -16392,12 +13422,12 @@ static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); if (LD && Op.hasOneUse() && LD->isUnindexed() && LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { - SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1), - DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)}; + SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), + DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); - SDValue VLDDup = - DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops, - LD->getMemoryVT(), LD->getMemOperand()); + SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, + Ops, LD->getMemoryVT(), + LD->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); return VLDDup; } @@ -16406,12 +13436,11 @@ static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, } static SDValue PerformLOADCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); // If this is a legal vector load, try to combine it into a VLD1_UPD. 
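// Editor's illustration (not part of the patch): the source-level pattern the
// VDUP-of-load combine above targets. A scalar load feeding a splat can become
// a single NEON load-and-duplicate (vld1.32 {d0[]}); the ACLE intrinsic below
// expresses the same thing. Sketch only (requires a NEON-enabled target); the
// function name is hypothetical.
#include <arm_neon.h>

float32x4_t splat_from_memory(const float *P) {
  return vld1q_dup_f32(P); // one load, replicated into all four lanes
}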
- if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() && + if (ISD::isNormalLoad(N) && VT.isVector() && DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) return CombineBaseUpdate(N, DCI); @@ -16495,7 +13524,7 @@ static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, ShuffWide, DAG.getIntPtrConstant(I, DL)); SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), - St->getAlign(), St->getMemOperand()->getFlags()); + St->getAlignment(), St->getMemOperand()->getFlags()); BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); Chains.push_back(Ch); @@ -16503,7 +13532,7 @@ static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } -// Try taking a single vector store from an fpround (which would otherwise turn +// Try taking a single vector store from an truncate (which would otherwise turn // into an expensive buildvector) and splitting it into a series of narrowing // stores. static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, @@ -16511,7 +13540,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) return SDValue(); SDValue Trunc = St->getValue(); - if (Trunc->getOpcode() != ISD::FP_ROUND) + if (Trunc->getOpcode() != ISD::TRUNCATE) return SDValue(); EVT FromVT = Trunc->getOperand(0).getValueType(); EVT ToVT = Trunc.getValueType(); @@ -16521,73 +13550,34 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, EVT ToEltVT = ToVT.getVectorElementType(); EVT FromEltVT = FromVT.getVectorElementType(); - if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16) - return SDValue(); - - unsigned NumElements = 4; - if (FromVT.getVectorNumElements() % NumElements != 0) + unsigned NumElements = 0; + if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8)) + NumElements = 4; + if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8) + NumElements = 8; + if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements || + FromVT.getVectorNumElements() % NumElements != 0) return SDValue(); - // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so - // use the VMOVN over splitting the store. We are looking for patterns of: - // !rev: 0 N 1 N+1 2 N+2 ... - // rev: N 0 N+1 1 N+2 2 ... - // The shuffle may either be a single source (in which case N = NumElts/2) or - // two inputs extended with concat to the same size (in which case N = - // NumElts). - auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) { - ArrayRef<int> M = SVN->getMask(); - unsigned NumElts = ToVT.getVectorNumElements(); - if (SVN->getOperand(1).isUndef()) - NumElts /= 2; - - unsigned Off0 = Rev ? NumElts : 0; - unsigned Off1 = Rev ? 
0 : NumElts; - - for (unsigned I = 0; I < NumElts; I += 2) { - if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2)) - return false; - if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2)) - return false; - } - - return true; - }; - - if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0))) - if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true)) - return SDValue(); - - LLVMContext &C = *DAG.getContext(); SDLoc DL(St); // Details about the old store SDValue Ch = St->getChain(); SDValue BasePtr = St->getBasePtr(); - Align Alignment = St->getOriginalAlign(); + unsigned Alignment = St->getOriginalAlignment(); MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); AAMDNodes AAInfo = St->getAAInfo(); - // We split the store into slices of NumElements. fp16 trunc stores are vcvt - // and then stored as truncating integer stores. - EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements); - EVT NewToVT = EVT::getVectorVT( - C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements); + EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements); + EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements); SmallVector<SDValue, 4> Stores; for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; - SDValue NewPtr = - DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); + SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), DAG.getConstant(i * NumElements, DL, MVT::i32)); - - SDValue FPTrunc = - DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16), - Extract, DAG.getConstant(0, DL, MVT::i32)); - Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc); - SDValue Store = DAG.getTruncStore( Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), NewToVT, Alignment, MMOFlags, AAInfo); @@ -16596,83 +13586,6 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); } -// Try taking a single vector store from an MVETRUNC (which would otherwise turn -// into an expensive buildvector) and splitting it into a series of narrowing -// stores. 
-static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, - SelectionDAG &DAG) { - if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) - return SDValue(); - SDValue Trunc = St->getValue(); - if (Trunc->getOpcode() != ARMISD::MVETRUNC) - return SDValue(); - EVT FromVT = Trunc->getOperand(0).getValueType(); - EVT ToVT = Trunc.getValueType(); - - LLVMContext &C = *DAG.getContext(); - SDLoc DL(St); - // Details about the old store - SDValue Ch = St->getChain(); - SDValue BasePtr = St->getBasePtr(); - Align Alignment = St->getOriginalAlign(); - MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); - AAMDNodes AAInfo = St->getAAInfo(); - - EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(), - FromVT.getVectorNumElements()); - - SmallVector<SDValue, 4> Stores; - for (unsigned i = 0; i < Trunc.getNumOperands(); i++) { - unsigned NewOffset = - i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8; - SDValue NewPtr = - DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); - - SDValue Extract = Trunc.getOperand(i); - SDValue Store = DAG.getTruncStore( - Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), - NewToVT, Alignment, MMOFlags, AAInfo); - Stores.push_back(Store); - } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); -} - -// Given a floating point store from an extracted vector, with an integer -// VGETLANE that already exists, store the existing VGETLANEu directly. This can -// help reduce fp register pressure, doesn't require the fp extract and allows -// use of more integer post-inc stores not available with vstr. -static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) { - if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) - return SDValue(); - SDValue Extract = St->getValue(); - EVT VT = Extract.getValueType(); - // For now only uses f16. This may be useful for f32 too, but that will - // be bitcast(extract), not the VGETLANEu we currently check here. - if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); - - SDNode *GetLane = - DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32), - {Extract.getOperand(0), Extract.getOperand(1)}); - if (!GetLane) - return SDValue(); - - LLVMContext &C = *DAG.getContext(); - SDLoc DL(St); - // Create a new integer store to replace the existing floating point version. - SDValue Ch = St->getChain(); - SDValue BasePtr = St->getBasePtr(); - Align Alignment = St->getOriginalAlign(); - MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); - AAMDNodes AAInfo = St->getAAInfo(); - EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits()); - SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr, - St->getPointerInfo(), NewToVT, Alignment, - MMOFlags, AAInfo); - - return Store; -} - /// PerformSTORECombine - Target-specific dag combine xforms for /// ISD::STORE. 
static SDValue PerformSTORECombine(SDNode *N, @@ -16688,15 +13601,9 @@ static SDValue PerformSTORECombine(SDNode *N, if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) return Store; - if (Subtarget->hasMVEIntegerOps()) { + if (Subtarget->hasMVEIntegerOps()) if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) return NewToken; - if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG)) - return NewChain; - if (SDValue NewToken = - PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG)) - return NewToken; - } if (!ISD::isNormalStore(St)) return SDValue(); @@ -16711,15 +13618,15 @@ static SDValue PerformSTORECombine(SDNode *N, SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore( St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), - BasePtr, St->getPointerInfo(), St->getOriginalAlign(), + BasePtr, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(isBigEndian ? 0 : 1), - OffsetPtr, St->getPointerInfo().getWithOffset(4), - St->getOriginalAlign(), + OffsetPtr, St->getPointerInfo(), + std::min(4U, St->getAlignment() / 2), St->getMemOperand()->getFlags()); } @@ -16743,7 +13650,7 @@ static SDValue PerformSTORECombine(SDNode *N, DCI.AddToWorklist(ExtElt.getNode()); DCI.AddToWorklist(V.getNode()); return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), - St->getPointerInfo(), St->getAlign(), + St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags(), St->getAAInfo()); } @@ -16812,49 +13719,6 @@ static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, return FixConv; } -static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEFloatOps()) - return SDValue(); - - // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x) - // The second form can be more easily turned into a predicated vadd, and - // possibly combined into a fma to become a predicated vfma. - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - EVT VT = N->getValueType(0); - SDLoc DL(N); - - // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set, - // which these VMOV's represent. - auto isIdentitySplat = [&](SDValue Op, bool NSZ) { - if (Op.getOpcode() != ISD::BITCAST || - Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM) - return false; - uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0); - if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ))) - return true; - if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ))) - return true; - return false; - }; - - if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT) - std::swap(Op0, Op1); - - if (Op1.getOpcode() != ISD::VSELECT) - return SDValue(); - - SDNodeFlags FaddFlags = N->getFlags(); - bool NSZ = FaddFlags.hasNoSignedZeros(); - if (!isIdentitySplat(Op1.getOperand(2), NSZ)) - return SDValue(); - - SDValue FAdd = - DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags); - return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags); -} - /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) /// can replace combinations of VCVT (integer to floating-point) and VDIV /// when the VDIV has a constant operand that is a power of 2. 
@@ -16914,351 +13778,8 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, ConvInput, DAG.getConstant(C, dl, MVT::i32)); } -static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - if (!ST->hasMVEIntegerOps()) - return SDValue(); - - assert(N->getOpcode() == ISD::VECREDUCE_ADD); - EVT ResVT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDLoc dl(N); - - // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y) - if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD && - (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 || - N0.getValueType() == MVT::v16i8)) { - SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0)); - SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1)); - return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1); - } - - // We are looking for something that will have illegal types if left alone, - // but that we can convert to a single instruction under MVE. For example - // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A - // or - // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B - - // The legal cases are: - // VADDV u/s 8/16/32 - // VMLAV u/s 8/16/32 - // VADDLV u/s 32 - // VMLALV u/s 16/32 - - // If the input vector is smaller than legal (v4i8/v4i16 for example) we can - // extend it and use v4i32 instead. - auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) { - EVT AVT = A.getValueType(); - return any_of(ExtTypes, [&](MVT Ty) { - return AVT.getVectorNumElements() == Ty.getVectorNumElements() && - AVT.bitsLE(Ty); - }); - }; - auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) { - EVT AVT = A.getValueType(); - if (!AVT.is128BitVector()) - A = DAG.getNode(ExtendCode, dl, - AVT.changeVectorElementType(MVT::getIntegerVT( - 128 / AVT.getVectorMinNumElements())), - A); - return A; - }; - auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) { - if (ResVT != RetTy || N0->getOpcode() != ExtendCode) - return SDValue(); - SDValue A = N0->getOperand(0); - if (ExtTypeMatches(A, ExtTypes)) - return ExtendIfNeeded(A, ExtendCode); - return SDValue(); - }; - auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode, - ArrayRef<MVT> ExtTypes, SDValue &Mask) { - if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || - !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) - return SDValue(); - Mask = N0->getOperand(0); - SDValue Ext = N0->getOperand(1); - if (Ext->getOpcode() != ExtendCode) - return SDValue(); - SDValue A = Ext->getOperand(0); - if (ExtTypeMatches(A, ExtTypes)) - return ExtendIfNeeded(A, ExtendCode); - return SDValue(); - }; - auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, - SDValue &A, SDValue &B) { - // For a vmla we are trying to match a larger pattern: - // ExtA = sext/zext A - // ExtB = sext/zext B - // Mul = mul ExtA, ExtB - // vecreduce.add Mul - // There might also be en extra extend between the mul and the addreduce, so - // long as the bitwidth is high enough to make them equivalent (for example - // original v8i16 might be mul at v8i32 and the reduce happens at v8i64). 
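// Illustrative aside, not part of the patch: a standalone scalar model of the
// pattern the IsVMLAV matcher above describes, i.e.
//   vecreduce.add(mul(sext(A), sext(B)))   on v8i16 inputs,
// which the backend's VMLAV node computes in one go. Values are invented.
#include <cstdint>
#include <cstdio>

// Every lane is sign-extended to i32 before the multiply and the products are
// accumulated into a single i32, matching the DAG pattern in the comment above.
static int32_t vmlav_s16_reference(const int16_t A[8], const int16_t B[8]) {
  int32_t Acc = 0;
  for (int I = 0; I < 8; ++I)
    Acc += int32_t(A[I]) * int32_t(B[I]);
  return Acc;
}

int main() {
  const int16_t A[8] = {1, -2, 3, -4, 5, -6, 7, -8};
  const int16_t B[8] = {100, 200, 300, 400, 500, 600, 700, 800};
  std::printf("reduction = %d\n", (int)vmlav_s16_reference(A, B));
  return 0;
}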
- if (ResVT != RetTy) - return false; - SDValue Mul = N0; - if (Mul->getOpcode() == ExtendCode && - Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= - ResVT.getScalarSizeInBits()) - Mul = Mul->getOperand(0); - if (Mul->getOpcode() != ISD::MUL) - return false; - SDValue ExtA = Mul->getOperand(0); - SDValue ExtB = Mul->getOperand(1); - if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode) - return false; - A = ExtA->getOperand(0); - B = ExtB->getOperand(0); - if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) { - A = ExtendIfNeeded(A, ExtendCode); - B = ExtendIfNeeded(B, ExtendCode); - return true; - } - return false; - }; - auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, - SDValue &A, SDValue &B, SDValue &Mask) { - // Same as the pattern above with a select for the zero predicated lanes - // ExtA = sext/zext A - // ExtB = sext/zext B - // Mul = mul ExtA, ExtB - // N0 = select Mask, Mul, 0 - // vecreduce.add N0 - if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || - !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) - return false; - Mask = N0->getOperand(0); - SDValue Mul = N0->getOperand(1); - if (Mul->getOpcode() == ExtendCode && - Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= - ResVT.getScalarSizeInBits()) - Mul = Mul->getOperand(0); - if (Mul->getOpcode() != ISD::MUL) - return false; - SDValue ExtA = Mul->getOperand(0); - SDValue ExtB = Mul->getOperand(1); - if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode) - return false; - A = ExtA->getOperand(0); - B = ExtB->getOperand(0); - if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) { - A = ExtendIfNeeded(A, ExtendCode); - B = ExtendIfNeeded(B, ExtendCode); - return true; - } - return false; - }; - auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) { - // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64 - // reductions. The operands are extended with MVEEXT, but as they are - // reductions the lane orders do not matter. MVEEXT may be combined with - // loads to produce two extending loads, or else they will be expanded to - // VREV/VMOVL. - EVT VT = Ops[0].getValueType(); - if (VT == MVT::v16i8) { - assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) && - "Unexpected illegal long reduction opcode"); - bool IsUnsigned = Opcode == ARMISD::VMLALVu; - - SDValue Ext0 = - DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl, - DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]); - SDValue Ext1 = - DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl, - DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]); - - SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), - Ext0, Ext1); - SDValue MLA1 = - DAG.getNode(IsUnsigned ? 
ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl, - DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1), - Ext0.getValue(1), Ext1.getValue(1)); - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1)); - } - SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops); - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node, - SDValue(Node.getNode(), 1)); - }; - - SDValue A, B; - SDValue Mask; - if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) - return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B); - if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) - return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B); - if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32}, - A, B)) - return Create64bitNode(ARMISD::VMLALVs, {A, B}); - if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32}, - A, B)) - return Create64bitNode(ARMISD::VMLALVu, {A, B}); - if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B)); - if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B)); - - if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, - Mask)) - return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask); - if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, - Mask)) - return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask); - if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B, - Mask)) - return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask}); - if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B, - Mask)) - return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask}); - if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask)); - if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask)); - - if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8})) - return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A); - if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8})) - return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A); - if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32})) - return Create64bitNode(ARMISD::VADDLVs, {A}); - if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32})) - return Create64bitNode(ARMISD::VADDLVu, {A}); - if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8})) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A)); - if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8})) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A)); - - if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) - return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask); - if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) - return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask); - if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask)) - return Create64bitNode(ARMISD::VADDLVps, {A, Mask}); - if (SDValue A = IsPredVADDV(MVT::i64, 
ISD::ZERO_EXTEND, {MVT::v4i32}, Mask)) - return Create64bitNode(ARMISD::VADDLVpu, {A, Mask}); - if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask)); - if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask)); - - // Some complications. We can get a case where the two inputs of the mul are - // the same, then the output sext will have been helpfully converted to a - // zext. Turn it back. - SDValue Op = N0; - if (Op->getOpcode() == ISD::VSELECT) - Op = Op->getOperand(1); - if (Op->getOpcode() == ISD::ZERO_EXTEND && - Op->getOperand(0)->getOpcode() == ISD::MUL) { - SDValue Mul = Op->getOperand(0); - if (Mul->getOperand(0) == Mul->getOperand(1) && - Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) { - SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul); - if (Op != N0) - Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0), - N0->getOperand(0), Ext, N0->getOperand(2)); - return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext); - } - } - - return SDValue(); -} - -static SDValue PerformVMOVNCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - unsigned IsTop = N->getConstantOperandVal(2); - - // VMOVNT a undef -> a - // VMOVNB a undef -> a - // VMOVNB undef a -> a - if (Op1->isUndef()) - return Op0; - if (Op0->isUndef() && !IsTop) - return Op1; - - // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b) - // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b) - if ((Op1->getOpcode() == ARMISD::VQMOVNs || - Op1->getOpcode() == ARMISD::VQMOVNu) && - Op1->getConstantOperandVal(2) == 0) - return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0), - Op0, Op1->getOperand(1), N->getOperand(2)); - - // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from - // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting - // into the top or bottom lanes. - unsigned NumElts = N->getValueType(0).getVectorNumElements(); - APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1)); - APInt Op0DemandedElts = - IsTop ? Op1DemandedElts - : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1)); - - const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI)) - return SDValue(N, 0); - if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI)) - return SDValue(N, 0); - - return SDValue(); -} - -static SDValue PerformVQMOVNCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SDValue Op0 = N->getOperand(0); - unsigned IsTop = N->getConstantOperandVal(2); - - unsigned NumElts = N->getValueType(0).getVectorNumElements(); - APInt Op0DemandedElts = - APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1) - : APInt::getHighBitsSet(2, 1)); - - const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI)) - return SDValue(N, 0); - return SDValue(); -} - -static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) { - SDLoc DL(N); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - - // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from - // uses of the intrinsics. 
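// Illustrative aside, not part of the patch: the long-shift combine below
// relies on the identity that a shift by a negative count -C (0 < C <= 32) is
// the opposite-direction shift by C. Here the {low, high} register pair of the
// LSLL/LSRL nodes is modelled as a single 64-bit value; this is a sketch of
// that identity only, not the backend code.
#include <cassert>
#include <cstdint>

// "Left shift by a possibly negative count": negative counts shift right,
// which is exactly the LSLL <-> LSRL rewrite implemented just below.
static uint64_t long_shift_left(uint64_t X, int Amt) {
  if (Amt >= 0)
    return Amt < 64 ? X << Amt : 0;
  return -Amt < 64 ? X >> -Amt : 0;
}

int main() {
  const uint64_t X = 0x0123456789abcdefULL;
  for (int C = 1; C <= 32; ++C)
    assert(long_shift_left(X, -C) == X >> C); // X << -C behaves as X >> C
  assert(long_shift_left(X, 0) == X);         // shift by zero is the identity
  return 0;
}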
- if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) { - int ShiftAmt = C->getSExtValue(); - if (ShiftAmt == 0) { - SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL); - DAG.ReplaceAllUsesWith(N, Merge.getNode()); - return SDValue(); - } - - if (ShiftAmt >= -32 && ShiftAmt < 0) { - unsigned NewOpcode = - N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL; - SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1, - DAG.getConstant(-ShiftAmt, DL, MVT::i32)); - DAG.ReplaceAllUsesWith(N, NewShift.getNode()); - return NewShift; - } - } - - return SDValue(); -} - /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. -SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; +static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); switch (IntNo) { default: @@ -17407,72 +13928,6 @@ SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, case Intrinsic::arm_neon_vqrshiftu: // No immediate versions of these to check for. break; - - case Intrinsic::arm_mve_vqdmlah: - case Intrinsic::arm_mve_vqdmlash: - case Intrinsic::arm_mve_vqrdmlah: - case Intrinsic::arm_mve_vqrdmlash: - case Intrinsic::arm_mve_vmla_n_predicated: - case Intrinsic::arm_mve_vmlas_n_predicated: - case Intrinsic::arm_mve_vqdmlah_predicated: - case Intrinsic::arm_mve_vqdmlash_predicated: - case Intrinsic::arm_mve_vqrdmlah_predicated: - case Intrinsic::arm_mve_vqrdmlash_predicated: { - // These intrinsics all take an i32 scalar operand which is narrowed to the - // size of a single lane of the vector type they return. So we don't need - // any bits of that operand above that point, which allows us to eliminate - // uxth/sxth. - unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); - APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); - if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI)) - return SDValue(); - break; - } - - case Intrinsic::arm_mve_minv: - case Intrinsic::arm_mve_maxv: - case Intrinsic::arm_mve_minav: - case Intrinsic::arm_mve_maxav: - case Intrinsic::arm_mve_minv_predicated: - case Intrinsic::arm_mve_maxv_predicated: - case Intrinsic::arm_mve_minav_predicated: - case Intrinsic::arm_mve_maxav_predicated: { - // These intrinsics all take an i32 scalar operand which is narrowed to the - // size of a single lane of the vector type they take as the other input. - unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits(); - APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); - if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) - return SDValue(); - break; - } - - case Intrinsic::arm_mve_addv: { - // Turn this intrinsic straight into the appropriate ARMISD::VADDV node, - // which allow PerformADDVecReduce to turn it into VADDLV when possible. - bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); - unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs; - return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1)); - } - - case Intrinsic::arm_mve_addlv: - case Intrinsic::arm_mve_addlv_predicated: { - // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR - // which recombines the two outputs into an i64 - bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); - unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ? - (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) : - (Unsigned ? 
ARMISD::VADDLVpu : ARMISD::VADDLVps); - - SmallVector<SDValue, 4> Ops; - for (unsigned i = 1, e = N->getNumOperands(); i < e; i++) - if (i != 2) // skip the unsigned flag - Ops.push_back(N->getOperand(i)); - - SDLoc dl(N); - SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops); - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0), - val.getValue(1)); - } } return SDValue(); @@ -17488,6 +13943,18 @@ static SDValue PerformShiftCombine(SDNode *N, const ARMSubtarget *ST) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); + if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { + // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high + // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. + SDValue N1 = N->getOperand(1); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { + SDValue N0 = N->getOperand(0); + if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && + DAG.MaskedValueIsZero(N0.getOperand(0), + APInt::getHighBitsSet(32, 16))) + return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); + } + } if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && N->getOperand(0)->getOpcode() == ISD::AND && @@ -17527,7 +13994,7 @@ static SDValue PerformShiftCombine(SDNode *N, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!VT.isVector() || !TLI.isTypeLegal(VT)) return SDValue(); - if (ST->hasMVEIntegerOps()) + if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) return SDValue(); int64_t Cnt; @@ -17556,10 +14023,9 @@ static SDValue PerformShiftCombine(SDNode *N, return SDValue(); } -// Look for a sign/zero/fpextend extend of a larger than legal load. This can be -// split into multiple extending loads, which are simpler to deal with than an -// arbitrary extend. For fp extends we use an integer extending load and a VCVTL -// to convert the type to an f32. +// Look for a sign/zero extend of a larger than legal load. This can be split +// into two extending loads, which are simpler to deal with than an arbitrary +// sign extend. static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::LOAD) @@ -17577,66 +14043,49 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { EVT FromEltVT = FromVT.getVectorElementType(); unsigned NumElements = 0; - if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8) - NumElements = 4; - if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16) + if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) NumElements = 4; + if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) + NumElements = 8; if (NumElements == 0 || - (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) || + FromVT.getVectorNumElements() == NumElements || FromVT.getVectorNumElements() % NumElements != 0 || !isPowerOf2_32(NumElements)) return SDValue(); - LLVMContext &C = *DAG.getContext(); SDLoc DL(LD); // Details about the old load SDValue Ch = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); - Align Alignment = LD->getOriginalAlign(); + unsigned Alignment = LD->getOriginalAlignment(); MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); AAMDNodes AAInfo = LD->getAAInfo(); ISD::LoadExtType NewExtType = N->getOpcode() == ISD::SIGN_EXTEND ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); - EVT NewFromVT = EVT::getVectorVT( - C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); - EVT NewToVT = EVT::getVectorVT( - C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); - - SmallVector<SDValue, 4> Loads; - SmallVector<SDValue, 4> Chains; - for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { - unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; - SDValue NewPtr = - DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); - - SDValue NewLoad = - DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, - LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, - Alignment, MMOFlags, AAInfo); - Loads.push_back(NewLoad); - Chains.push_back(SDValue(NewLoad.getNode(), 1)); - } - - // Float truncs need to extended with VCVTB's into their floating point types. - if (FromEltVT == MVT::f16) { - SmallVector<SDValue, 4> Extends; - - for (unsigned i = 0; i < Loads.size(); i++) { - SDValue LoadBC = - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]); - SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC, - DAG.getConstant(0, DL, MVT::i32)); - Extends.push_back(FPExt); - } - - Loads = Extends; - } - - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext()); + EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); + unsigned NewOffset = NewFromVT.getSizeInBits() / 8; + SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); + + // Split the load in half, each side of which is extended separately. This + // is good enough, as legalisation will take it from there. They are either + // already legal or they will be split further into something that is + // legal. + SDValue NewLoad1 = + DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset, + LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo); + SDValue NewLoad2 = + DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, + LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, + Alignment, MMOFlags, AAInfo); + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(NewLoad1.getNode(), 1), + SDValue(NewLoad2.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2); } /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, @@ -17684,164 +14133,6 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - if (ST->hasMVEFloatOps()) - if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) - return NewLoad; - - return SDValue(); -} - -// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating -// constant bounds. 
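// Illustrative aside, not part of the patch: the smin/smax-to-ssat/usat
// rewrite below rests on the equivalence between clamping against suitable
// constants and saturating. A standalone scalar check of both matched cases;
// the helpers model the clamp semantics only, not the ARMISD::SSAT/USAT
// operand encoding.
#include <algorithm>
#include <cassert>
#include <cstdint>

static int32_t clamp_smin_smax(int32_t X, int32_t Lo, int32_t Hi) {
  return std::min(std::max(X, Lo), Hi); // smin(smax(x, Lo), Hi)
}

static int32_t saturate_signed(int32_t X, unsigned Bits) {
  // Signed saturation to [-2^(Bits-1), 2^(Bits-1) - 1].
  int32_t Hi = (1 << (Bits - 1)) - 1, Lo = -(1 << (Bits - 1));
  return X > Hi ? Hi : (X < Lo ? Lo : X);
}

static int32_t saturate_unsigned(int32_t X, unsigned Bits) {
  // Unsigned saturation to [0, 2^Bits - 1].
  int32_t Hi = (1 << Bits) - 1;
  return X > Hi ? Hi : (X < 0 ? 0 : X);
}

int main() {
  const unsigned K = 7; // MinC = (1 << K) - 1 = 127, as the matcher requires
  for (int32_t X : {-100000, -129, -128, -1, 0, 1, 127, 128, 100000}) {
    // MinC == ~MaxC case: clamping to [-(1<<K), (1<<K)-1] is (K+1)-bit ssat.
    assert(clamp_smin_smax(X, -(1 << K), (1 << K) - 1) ==
           saturate_signed(X, K + 1));
    // MaxC == 0 case: clamping to [0, (1<<K)-1] is K-bit usat.
    assert(clamp_smin_smax(X, 0, (1 << K) - 1) == saturate_unsigned(X, K));
  }
  return 0;
}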
-static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) && - !Subtarget->isThumb2()) - return SDValue(); - - EVT VT = Op.getValueType(); - SDValue Op0 = Op.getOperand(0); - - if (VT != MVT::i32 || - (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) || - !isa<ConstantSDNode>(Op.getOperand(1)) || - !isa<ConstantSDNode>(Op0.getOperand(1))) - return SDValue(); - - SDValue Min = Op; - SDValue Max = Op0; - SDValue Input = Op0.getOperand(0); - if (Min.getOpcode() == ISD::SMAX) - std::swap(Min, Max); - - APInt MinC = Min.getConstantOperandAPInt(1); - APInt MaxC = Max.getConstantOperandAPInt(1); - - if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX || - !(MinC + 1).isPowerOf2()) - return SDValue(); - - SDLoc DL(Op); - if (MinC == ~MaxC) - return DAG.getNode(ARMISD::SSAT, DL, VT, Input, - DAG.getConstant(MinC.countTrailingOnes(), DL, VT)); - if (MaxC == 0) - return DAG.getNode(ARMISD::USAT, DL, VT, Input, - DAG.getConstant(MinC.countTrailingOnes(), DL, VT)); - - return SDValue(); -} - -/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating -/// saturates. -static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - - if (VT == MVT::i32) - return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST); - - if (!ST->hasMVEIntegerOps()) - return SDValue(); - - if (SDValue V = PerformVQDMULHCombine(N, DAG)) - return V; - - if (VT != MVT::v4i32 && VT != MVT::v8i16) - return SDValue(); - - auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) { - // Check one is a smin and the other is a smax - if (Min->getOpcode() != ISD::SMIN) - std::swap(Min, Max); - if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX) - return false; - - APInt SaturateC; - if (VT == MVT::v4i32) - SaturateC = APInt(32, (1 << 15) - 1, true); - else //if (VT == MVT::v8i16) - SaturateC = APInt(16, (1 << 7) - 1, true); - - APInt MinC, MaxC; - if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || - MinC != SaturateC) - return false; - if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) || - MaxC != ~SaturateC) - return false; - return true; - }; - - if (IsSignedSaturate(N, N0.getNode())) { - SDLoc DL(N); - MVT ExtVT, HalfVT; - if (VT == MVT::v4i32) { - HalfVT = MVT::v8i16; - ExtVT = MVT::v4i16; - } else { // if (VT == MVT::v8i16) - HalfVT = MVT::v16i8; - ExtVT = MVT::v8i8; - } - - // Create a VQMOVNB with undef top lanes, then signed extended into the top - // half. That extend will hopefully be removed if only the bottom bits are - // demanded (though a truncating store, for example). 
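// Illustrative aside, not part of the patch: the matcher above recognises
// smin/smax bounds of -2^15 and 2^15 - 1 on a v4i32 precisely because clamping
// each lane to the i16 range and then truncating it is the signed saturating
// narrow that VQMOVN performs. A standalone per-lane model with invented
// values.
#include <algorithm>
#include <cassert>
#include <cstdint>

static int16_t vqmovn_s32_lane(int32_t X) {
  // Clamp to [INT16_MIN, INT16_MAX], then truncate: a saturating narrow.
  return static_cast<int16_t>(std::min(std::max(X, -32768), 32767));
}

int main() {
  const int32_t In[4] = {40000, -40000, 123, -123};
  const int16_t Expected[4] = {32767, -32768, 123, -123};
  for (int I = 0; I < 4; ++I)
    assert(vqmovn_s32_lane(In[I]) == Expected[I]);
  return 0;
}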
- SDValue VQMOVN = - DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT), - N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32)); - SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); - return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast, - DAG.getValueType(ExtVT)); - } - - auto IsUnsignedSaturate = [&](SDNode *Min) { - // For unsigned, we just need to check for <= 0xffff - if (Min->getOpcode() != ISD::UMIN) - return false; - - APInt SaturateC; - if (VT == MVT::v4i32) - SaturateC = APInt(32, (1 << 16) - 1, true); - else //if (VT == MVT::v8i16) - SaturateC = APInt(16, (1 << 8) - 1, true); - - APInt MinC; - if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || - MinC != SaturateC) - return false; - return true; - }; - - if (IsUnsignedSaturate(N)) { - SDLoc DL(N); - MVT HalfVT; - unsigned ExtConst; - if (VT == MVT::v4i32) { - HalfVT = MVT::v8i16; - ExtConst = 0x0000FFFF; - } else { //if (VT == MVT::v8i16) - HalfVT = MVT::v16i8; - ExtConst = 0x00FF; - } - - // Create a VQMOVNB with undef top lanes, then ZExt into the top half with - // an AND. That extend will hopefully be removed if only the bottom bits are - // demanded (though a truncating store, for example). - SDValue VQMOVN = - DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0, - DAG.getConstant(0, DL, MVT::i32)); - SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); - return DAG.getNode(ISD::AND, DL, VT, Bitcast, - DAG.getConstant(ExtConst, DL, VT)); - } - - return SDValue(); -} - static const APInt *isPowerOf2Constant(SDValue V) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); if (!C) @@ -17963,7 +14254,7 @@ static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1)); if (!Const) return SDValue(); - if (Const->isZero()) + if (Const->isNullValue()) Imm = 0; else if (Const->isOne()) Imm = 1; @@ -17974,7 +14265,7 @@ static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, } case ISD::INTRINSIC_W_CHAIN: { unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue(); - if (IntOp != Intrinsic::test_start_loop_iterations && + if (IntOp != Intrinsic::test_set_loop_iterations && IntOp != Intrinsic::loop_decrement_reg) return SDValue(); return N; @@ -17989,7 +14280,7 @@ static SDValue PerformHWLoopCombine(SDNode *N, // The hwloop intrinsics that we're interested are used for control-flow, // either for entering or exiting the loop: - // - test.start.loop.iterations will test whether its operand is zero. If it + // - test.set.loop.iterations will test whether its operand is zero. If it // is zero, the proceeding branch should not enter the loop. // - loop.decrement.reg also tests whether its operand is zero. 
If it is // zero, the proceeding branch should not branch back to the beginning of @@ -18015,7 +14306,7 @@ static SDValue PerformHWLoopCombine(SDNode *N, Cond = N->getOperand(2); Dest = N->getOperand(4); if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { - if (!Const->isOne() && !Const->isZero()) + if (!Const->isOne() && !Const->isNullValue()) return SDValue(); Imm = Const->getZExtValue(); } else @@ -18064,25 +14355,21 @@ static SDValue PerformHWLoopCombine(SDNode *N, DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); }; - if (IntOp == Intrinsic::test_start_loop_iterations) { + if (IntOp == Intrinsic::test_set_loop_iterations) { SDValue Res; - SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements); // We expect this 'instruction' to branch when the counter is zero. if (IsTrueIfZero(CC, Imm)) { - SDValue Ops[] = {Chain, Setup, Dest}; + SDValue Ops[] = { Chain, Elements, Dest }; Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); } else { // The logic is the reverse of what we need for WLS, so find the other // basic block target: the target of the proceeding br. UpdateUncondBr(Br, Dest, DAG); - SDValue Ops[] = {Chain, Setup, OtherTarget}; + SDValue Ops[] = { Chain, Elements, OtherTarget }; Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); } - // Update LR count to the new value - DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup); - // Update chain - DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0)); + DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); return Res; } else { SDValue Size = DAG.getTargetConstant( @@ -18220,23 +14507,6 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { if (!VT.isInteger()) return SDValue(); - // Fold away an unneccessary CMPZ/CMOV - // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) -> - // if C1==EQ -> CMOV A, B, C2, $cpsr, D - // if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D - if (N->getConstantOperandVal(2) == ARMCC::EQ || - N->getConstantOperandVal(2) == ARMCC::NE) { - ARMCC::CondCodes Cond; - if (SDValue C = IsCMPZCSINC(N->getOperand(4).getNode(), Cond)) { - if (N->getConstantOperandVal(2) == ARMCC::NE) - Cond = ARMCC::getOppositeCondition(Cond); - return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), - N->getOperand(1), - DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32), - N->getOperand(3), C); - } - } - // Materialize a boolean comparison for integers so we can avoid branching. if (isNullConstant(FalseVal)) { if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { @@ -18344,325 +14614,10 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { return Res; } -static SDValue PerformBITCASTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *ST) { - SelectionDAG &DAG = DCI.DAG; - SDValue Src = N->getOperand(0); - EVT DstVT = N->getValueType(0); - - // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE. - if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) { - EVT SrcVT = Src.getValueType(); - if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits()) - return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0)); - } - - // We may have a bitcast of something that has already had this bitcast - // combine performed on it, so skip past any VECTOR_REG_CASTs. 
- while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST) - Src = Src.getOperand(0); - - // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that - // would be generated is at least the width of the element type. - EVT SrcVT = Src.getValueType(); - if ((Src.getOpcode() == ARMISD::VMOVIMM || - Src.getOpcode() == ARMISD::VMVNIMM || - Src.getOpcode() == ARMISD::VMOVFPIMM) && - SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() && - DAG.getDataLayout().isBigEndian()) - return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src); - - // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x - if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) - return R; - - return SDValue(); -} - -// Some combines for the MVETrunc truncations legalizer helper. Also lowers the -// node into stack operations after legalizeOps. -SDValue ARMTargetLowering::PerformMVETruncCombine( - SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - SDLoc DL(N); - - // MVETrunc(Undef, Undef) -> Undef - if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); })) - return DAG.getUNDEF(VT); - - // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc - if (N->getNumOperands() == 2 && - N->getOperand(0).getOpcode() == ARMISD::MVETRUNC && - N->getOperand(1).getOpcode() == ARMISD::MVETRUNC) - return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0), - N->getOperand(0).getOperand(1), - N->getOperand(1).getOperand(0), - N->getOperand(1).getOperand(1)); - - // MVETrunc(shuffle, shuffle) -> VMOVN - if (N->getNumOperands() == 2 && - N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && - N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) { - auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode()); - auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode()); - - if (S0->getOperand(0) == S1->getOperand(0) && - S0->getOperand(1) == S1->getOperand(1)) { - // Construct complete shuffle mask - SmallVector<int, 8> Mask(S0->getMask()); - Mask.append(S1->getMask().begin(), S1->getMask().end()); - - if (isVMOVNTruncMask(Mask, VT, false)) - return DAG.getNode( - ARMISD::VMOVN, DL, VT, - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)), - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), - DAG.getConstant(1, DL, MVT::i32)); - if (isVMOVNTruncMask(Mask, VT, true)) - return DAG.getNode( - ARMISD::VMOVN, DL, VT, - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), - DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)), - DAG.getConstant(1, DL, MVT::i32)); - } - } - - // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the - // truncate to a buildvector to allow the generic optimisations to kick in. 
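// Illustrative aside, not part of the patch: the buildvector lowering just
// below extracts every lane of the MVETRUNC operands in order and rebuilds the
// narrower vector from them. A standalone scalar model of that lane ordering
// for the two-operand v4i32 -> v8i16 case, with invented values.
#include <cassert>
#include <cstdint>

static void mvetrunc_2x_v4i32(const int32_t A[4], const int32_t B[4],
                              int16_t Out[8]) {
  // Lanes of the first operand come first, then the second, each truncated.
  for (int I = 0; I < 4; ++I) {
    Out[I] = static_cast<int16_t>(A[I]);
    Out[I + 4] = static_cast<int16_t>(B[I]);
  }
}

int main() {
  const int32_t A[4] = {0x10001, 0x20002, 0x30003, 0x40004};
  const int32_t B[4] = {0x50005, 0x60006, 0x70007, 0x80008};
  int16_t Out[8];
  mvetrunc_2x_v4i32(A, B, Out);
  assert(Out[0] == 1 && Out[3] == 4); // truncated lanes of A, in order
  assert(Out[4] == 5 && Out[7] == 8); // then truncated lanes of B
  return 0;
}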
- if (all_of(N->ops(), [](SDValue Op) { - return Op.getOpcode() == ISD::BUILD_VECTOR || - Op.getOpcode() == ISD::VECTOR_SHUFFLE || - (Op.getOpcode() == ISD::BITCAST && - Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR); - })) { - SmallVector<SDValue, 8> Extracts; - for (unsigned Op = 0; Op < N->getNumOperands(); Op++) { - SDValue O = N->getOperand(Op); - for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) { - SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O, - DAG.getConstant(i, DL, MVT::i32)); - Extracts.push_back(Ext); - } - } - return DAG.getBuildVector(VT, DL, Extracts); - } - - // If we are late in the legalization process and nothing has optimised - // the trunc to anything better, lower it to a stack store and reload, - // performing the truncation whilst keeping the lanes in the correct order: - // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack; - if (!DCI.isAfterLegalizeDAG()) - return SDValue(); - - SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4)); - int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); - int NumIns = N->getNumOperands(); - assert((NumIns == 2 || NumIns == 4) && - "Expected 2 or 4 inputs to an MVETrunc"); - EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); - if (N->getNumOperands() == 4) - StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext()); - - SmallVector<SDValue> Chains; - for (int I = 0; I < NumIns; I++) { - SDValue Ptr = DAG.getNode( - ISD::ADD, DL, StackPtr.getValueType(), StackPtr, - DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType())); - MachinePointerInfo MPI = MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), SPFI, I * 16 / NumIns); - SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I), - Ptr, MPI, StoreVT, Align(4)); - Chains.push_back(Ch); - } - - SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - MachinePointerInfo MPI = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0); - return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4)); -} - -// Take a MVEEXT(load x) and split that into (extload x, extload x+8) -static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, - SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode()); - if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed()) - return SDValue(); - - EVT FromVT = LD->getMemoryVT(); - EVT ToVT = N->getValueType(0); - if (!ToVT.isVector()) - return SDValue(); - assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2); - EVT ToEltVT = ToVT.getVectorElementType(); - EVT FromEltVT = FromVT.getVectorElementType(); - - unsigned NumElements = 0; - if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) - NumElements = 4; - if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) - NumElements = 8; - assert(NumElements != 0); - - ISD::LoadExtType NewExtType = - N->getOpcode() == ARMISD::MVESEXT ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; - if (LD->getExtensionType() != ISD::NON_EXTLOAD && - LD->getExtensionType() != ISD::EXTLOAD && - LD->getExtensionType() != NewExtType) - return SDValue(); - - LLVMContext &C = *DAG.getContext(); - SDLoc DL(LD); - // Details about the old load - SDValue Ch = LD->getChain(); - SDValue BasePtr = LD->getBasePtr(); - Align Alignment = LD->getOriginalAlign(); - MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); - AAMDNodes AAInfo = LD->getAAInfo(); - - SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); - EVT NewFromVT = EVT::getVectorVT( - C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); - EVT NewToVT = EVT::getVectorVT( - C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); - - SmallVector<SDValue, 4> Loads; - SmallVector<SDValue, 4> Chains; - for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { - unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; - SDValue NewPtr = - DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); - - SDValue NewLoad = - DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, - LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, - Alignment, MMOFlags, AAInfo); - Loads.push_back(NewLoad); - Chains.push_back(SDValue(NewLoad.getNode(), 1)); - } - - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); - return DAG.getMergeValues(Loads, DL); -} - -// Perform combines for MVEEXT. If it has not be optimized to anything better -// before lowering, it gets converted to stack store and extloads performing the -// extend whilst still keeping the same lane ordering. -SDValue ARMTargetLowering::PerformMVEExtCombine( - SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - SDLoc DL(N); - assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements"); - assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type"); - - EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT( - *DAG.getContext()); - auto Extend = [&](SDValue V) { - SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V); - return N->getOpcode() == ARMISD::MVESEXT - ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT, - DAG.getValueType(ExtVT)) - : DAG.getZeroExtendInReg(VVT, DL, ExtVT); - }; - - // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP) - if (N->getOperand(0).getOpcode() == ARMISD::VDUP) { - SDValue Ext = Extend(N->getOperand(0)); - return DAG.getMergeValues({Ext, Ext}, DL); - } - - // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG - if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) { - ArrayRef<int> Mask = SVN->getMask(); - assert(Mask.size() == 2 * VT.getVectorNumElements()); - assert(Mask.size() == SVN->getValueType(0).getVectorNumElements()); - unsigned Rev = VT == MVT::v4i32 ? 
ARMISD::VREV32 : ARMISD::VREV16; - SDValue Op0 = SVN->getOperand(0); - SDValue Op1 = SVN->getOperand(1); - - auto CheckInregMask = [&](int Start, int Offset) { - for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx) - if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset) - return false; - return true; - }; - SDValue V0 = SDValue(N, 0); - SDValue V1 = SDValue(N, 1); - if (CheckInregMask(0, 0)) - V0 = Extend(Op0); - else if (CheckInregMask(0, 1)) - V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0)); - else if (CheckInregMask(0, Mask.size())) - V0 = Extend(Op1); - else if (CheckInregMask(0, Mask.size() + 1)) - V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1)); - - if (CheckInregMask(VT.getVectorNumElements(), Mask.size())) - V1 = Extend(Op1); - else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1)) - V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1)); - else if (CheckInregMask(VT.getVectorNumElements(), 0)) - V1 = Extend(Op0); - else if (CheckInregMask(VT.getVectorNumElements(), 1)) - V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0)); - - if (V0.getNode() != N || V1.getNode() != N) - return DAG.getMergeValues({V0, V1}, DL); - } - - // MVEEXT(load) -> extload, extload - if (N->getOperand(0)->getOpcode() == ISD::LOAD) - if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG)) - return L; - - if (!DCI.isAfterLegalizeDAG()) - return SDValue(); - - // Lower to a stack store and reload: - // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8; - SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4)); - int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); - int NumOuts = N->getNumValues(); - assert((NumOuts == 2 || NumOuts == 4) && - "Expected 2 or 4 outputs to an MVEEXT"); - EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT( - *DAG.getContext()); - if (N->getNumOperands() == 4) - LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext()); - - MachinePointerInfo MPI = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0); - SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0), - StackPtr, MPI, Align(4)); - - SmallVector<SDValue> Loads; - for (int I = 0; I < NumOuts; I++) { - SDValue Ptr = DAG.getNode( - ISD::ADD, DL, StackPtr.getValueType(), StackPtr, - DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType())); - MachinePointerInfo MPI = MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), SPFI, I * 16 / NumOuts); - SDValue Load = DAG.getExtLoad( - N->getOpcode() == ARMISD::MVESEXT ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, - VT, Chain, Ptr, MPI, LoadVT, Align(4)); - Loads.push_back(Load); - } - - return DAG.getMergeValues(Loads, DL); -} - SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; - case ISD::SELECT_CC: - case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); - case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); - case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget); case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); @@ -18677,57 +14632,31 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); - case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG); + case ARMISD::BFI: return PerformBFICombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); - case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI); - case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG); case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); - case ISD::EXTRACT_VECTOR_ELT: - return PerformExtractEltCombine(N, DCI, Subtarget); - case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG); - case ISD::INSERT_SUBVECTOR: return PerformInsertSubvectorCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); - case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget); - case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget); + case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); + case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); - case ISD::FADD: - return PerformFAddVSelectCombine(N, DCI.DAG, Subtarget); case ISD::FDIV: return PerformVDIVCombine(N, DCI.DAG, Subtarget); - case ISD::INTRINSIC_WO_CHAIN: - return PerformIntrinsicCombine(N, DCI); + case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: return PerformShiftCombine(N, DCI, Subtarget); case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: - case ISD::ANY_EXTEND: - return PerformExtendCombine(N, DCI.DAG, Subtarget); - case ISD::FP_EXTEND: - return PerformFPExtendCombine(N, DCI.DAG, Subtarget); - case ISD::SMIN: - case ISD::UMIN: - case ISD::SMAX: - case ISD::UMAX: - return PerformMinMaxCombine(N, DCI.DAG, Subtarget); - case ARMISD::CMOV: - return PerformCMOVCombine(N, DCI.DAG); - case ARMISD::BRCOND: - return PerformBRCONDCombine(N, DCI.DAG); - case ARMISD::CMPZ: - return PerformCMPZCombine(N, DCI.DAG); - case ARMISD::CSINC: - case ARMISD::CSINV: - case ARMISD::CSNEG: - return PerformCSETCombine(N, DCI.DAG); - case ISD::LOAD: - return PerformLOADCombine(N, DCI, Subtarget); + case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); + case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); + case ISD::LOAD: return PerformLOADCombine(N, DCI); 
case ARMISD::VLD1DUP: case ARMISD::VLD2DUP: case ARMISD::VLD3DUP: @@ -18735,30 +14664,10 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); - case ISD::BITCAST: - return PerformBITCASTCombine(N, DCI, Subtarget); case ARMISD::PREDICATE_CAST: return PerformPREDICATE_CASTCombine(N, DCI); - case ARMISD::VECTOR_REG_CAST: - return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget); - case ARMISD::MVETRUNC: - return PerformMVETruncCombine(N, DCI); - case ARMISD::MVESEXT: - case ARMISD::MVEZEXT: - return PerformMVEExtCombine(N, DCI); case ARMISD::VCMP: - return PerformVCMPCombine(N, DCI.DAG, Subtarget); - case ISD::VECREDUCE_ADD: - return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget); - case ARMISD::VMOVN: - return PerformVMOVNCombine(N, DCI); - case ARMISD::VQMOVNs: - case ARMISD::VQMOVNu: - return PerformVQMOVNCombine(N, DCI); - case ARMISD::ASRL: - case ARMISD::LSRL: - case ARMISD::LSLL: - return PerformLongShiftCombine(N, DCI.DAG); + return PerformVCMPCombine(N, DCI, Subtarget); case ARMISD::SMULWB: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); @@ -18775,9 +14684,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, } case ARMISD::SMLALBB: case ARMISD::QADD16b: - case ARMISD::QSUB16b: - case ARMISD::UQADD16b: - case ARMISD::UQSUB16b: { + case ARMISD::QSUB16b: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || @@ -18814,9 +14721,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, break; } case ARMISD::QADD8b: - case ARMISD::QSUB8b: - case ARMISD::UQADD8b: - case ARMISD::UQSUB8b: { + case ARMISD::QSUB8b: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8); if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || @@ -18851,11 +14756,6 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: return PerformVLDCombine(N, DCI); - case Intrinsic::arm_mve_vld2q: - case Intrinsic::arm_mve_vld4q: - case Intrinsic::arm_mve_vst2q: - case Intrinsic::arm_mve_vst4q: - return PerformMVEVLDCombine(N, DCI); default: break; } break; @@ -18869,9 +14769,9 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, } bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, - Align Alignment, + unsigned Alignment, MachineMemOperand::Flags, - unsigned *Fast) const { + bool *Fast) const { // Depends what it gets converted into if the type is weird. 
if (!VT.isSimple()) return false; @@ -18895,7 +14795,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, // A big-endian target may also explicitly support unaligned accesses if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { if (Fast) - *Fast = 1; + *Fast = true; return true; } } @@ -18904,10 +14804,9 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, return false; // These are for predicates - if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 || - Ty == MVT::v2i1)) { + if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { if (Fast) - *Fast = 1; + *Fast = true; return true; } @@ -18933,30 +14832,37 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 || Ty == MVT::v2f64) { if (Fast) - *Fast = 1; + *Fast = true; return true; } return false; } +static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, + unsigned AlignCheck) { + return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && + (DstAlign == 0 || DstAlign % AlignCheck == 0)); +} EVT ARMTargetLowering::getOptimalMemOpType( - const MemOp &Op, const AttributeList &FuncAttributes) const { + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { // See if we can use NEON instructions for this... - if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() && - !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) { - unsigned Fast; - if (Op.size() >= 16 && - (Op.isAligned(Align(16)) || - (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1), + if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { + bool Fast; + if (Size >= 16 && + (memOpAlign(SrcAlign, DstAlign, 16) || + (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, MachineMemOperand::MONone, &Fast) && Fast))) { return MVT::v2f64; - } else if (Op.size() >= 8 && - (Op.isAligned(Align(8)) || + } else if (Size >= 8 && + (memOpAlign(SrcAlign, DstAlign, 8) || (allowsMisalignedMemoryAccesses( - MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) && + MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) && Fast))) { return MVT::f64; } @@ -19068,119 +14974,45 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I, if (!Subtarget->hasMVEIntegerOps()) return false; - auto IsFMSMul = [&](Instruction *I) { - if (!I->hasOneUse()) - return false; - auto *Sub = cast<Instruction>(*I->users().begin()); - return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; - }; - auto IsFMS = [&](Instruction *I) { - if (match(I->getOperand(0), m_FNeg(m_Value())) || - match(I->getOperand(1), m_FNeg(m_Value()))) - return true; - return false; - }; - - auto IsSinker = [&](Instruction *I, int Operand) { + auto IsSinker = [](Instruction *I, int Operand) { switch (I->getOpcode()) { case Instruction::Add: case Instruction::Mul: - case Instruction::FAdd: case Instruction::ICmp: - case Instruction::FCmp: return true; - case Instruction::FMul: - return !IsFMSMul(I); case Instruction::Sub: - case Instruction::FSub: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: return Operand == 1; - case Instruction::Call: - if (auto *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::fma: - return !IsFMS(I); - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::arm_mve_add_predicated: - case 
Intrinsic::arm_mve_mul_predicated: - case Intrinsic::arm_mve_qadd_predicated: - case Intrinsic::arm_mve_vhadd: - case Intrinsic::arm_mve_hadd_predicated: - case Intrinsic::arm_mve_vqdmull: - case Intrinsic::arm_mve_vqdmull_predicated: - case Intrinsic::arm_mve_vqdmulh: - case Intrinsic::arm_mve_qdmulh_predicated: - case Intrinsic::arm_mve_vqrdmulh: - case Intrinsic::arm_mve_qrdmulh_predicated: - case Intrinsic::arm_mve_fma_predicated: - return true; - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - case Intrinsic::arm_mve_sub_predicated: - case Intrinsic::arm_mve_qsub_predicated: - case Intrinsic::arm_mve_hsub_predicated: - case Intrinsic::arm_mve_vhsub: - return Operand == 1; - default: - return false; - } - } - return false; default: return false; } }; - for (auto OpIdx : enumerate(I->operands())) { - Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get()); - // Make sure we are not already sinking this operand - if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) - continue; - - Instruction *Shuffle = Op; - if (Shuffle->getOpcode() == Instruction::BitCast) - Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0)); - // We are looking for a splat that can be sunk. - if (!Shuffle || - !match(Shuffle, m_Shuffle( - m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), - m_Undef(), m_ZeroMask()))) - continue; - if (!IsSinker(I, OpIdx.index())) - continue; - - // All uses of the shuffle should be sunk to avoid duplicating it across gpr - // and vector registers - for (Use &U : Op->uses()) { - Instruction *Insn = cast<Instruction>(U.getUser()); - if (!IsSinker(Insn, U.getOperandNo())) - return false; - } - - Ops.push_back(&Shuffle->getOperandUse(0)); - if (Shuffle != Op) - Ops.push_back(&Op->getOperandUse(0)); - Ops.push_back(&OpIdx.value()); + int Op = 0; + if (!isa<ShuffleVectorInst>(I->getOperand(Op))) + Op = 1; + if (!IsSinker(I, Op)) + return false; + if (!match(I->getOperand(Op), + m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()), + m_Undef(), m_Zero()))) { + return false; + } + Instruction *Shuffle = cast<Instruction>(I->getOperand(Op)); + // All uses of the shuffle should be sunk to avoid duplicating it across gpr + // and vector registers + for (Use &U : Shuffle->uses()) { + Instruction *Insn = cast<Instruction>(U.getUser()); + if (!IsSinker(Insn, U.getOperandNo())) + return false; } + Ops.push_back(&Shuffle->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(Op)); return true; } -Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const { - if (!Subtarget->hasMVEIntegerOps()) - return nullptr; - Type *SVIType = SVI->getType(); - Type *ScalarType = SVIType->getScalarType(); - - if (ScalarType->isFloatTy()) - return Type::getInt32Ty(SVIType->getContext()); - if (ScalarType->isHalfTy()) - return Type::getInt16Ty(SVIType->getContext()); - return nullptr; -} - bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT VT = ExtVal.getValueType(); @@ -19192,9 +15024,6 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { return false; } - if (Subtarget->hasMVEIntegerOps()) - return true; - // Don't create a loadext if we can fold the extension into a wide/long // instruction. 
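// Illustrative aside, not part of the diff: the removed shouldConvertSplatType hook
// above asks for float/half splats to be rebuilt as same-width integer splats.  A splat
// only replicates a bit pattern, so splatting the i32 image of a float and
// reinterpreting each lane yields the identical vector -- a minimal standalone check.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  float f = 3.14159f;

  uint32_t bits;                                // i32 image of the float
  std::memcpy(&bits, &f, sizeof bits);

  uint32_t lanes[4] = {bits, bits, bits, bits}; // integer splat of the bit pattern

  for (uint32_t lane : lanes) {
    float back;
    std::memcpy(&back, &lane, sizeof back);     // reinterpret each lane as float
    assert(back == f);                          // same value in every lane
  }
  return 0;
}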
// If there's more than one user instruction, the loadext is desirable no @@ -19225,6 +15054,17 @@ bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { return true; } +int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { + if (isLegalAddressingMode(DL, AM, Ty, AS)) { + if (Subtarget->hasFPAO()) + return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster + return 0; + } + return -1; +} + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be /// expanded to FMAs when this method returns true, otherwise fmuladd is @@ -19521,31 +15361,6 @@ bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { return AbsImm >= 0 && AbsImm <= 255; } -// Return false to prevent folding -// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine, -// if the folding leads to worse code. -bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode, - SDValue ConstNode) const { - // Let the DAGCombiner decide for vector types and large types. - const EVT VT = AddNode.getValueType(); - if (VT.isVector() || VT.getScalarSizeInBits() > 32) - return true; - - // It is worse if c0 is legal add immediate, while c1*c0 is not - // and has to be composed by at least two instructions. - const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1)); - const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode); - const int64_t C0 = C0Node->getSExtValue(); - APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue(); - if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue())) - return true; - if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1) - return false; - - // Default to true and let the DAGCombiner decide. - return true; -} - static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, @@ -19630,7 +15445,7 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, return false; } -static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, +static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG) { @@ -19665,16 +15480,16 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, // (in BE/masked) type. 
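// Illustrative aside, not part of the diff: the removed isMulAddWithConstProfitable
// hook further up blocks (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) when c0 is a
// cheap immediate but c0*c1 is not.  Sketch below: the identity itself plus a
// simplified A32 "modified immediate" test (8 bits rotated right by an even amount)
// standing in, as an assumption, for isLegalAddImmediate / ConstantMaterializationCost.
#include <cassert>
#include <cstdint>

static bool isA32ModifiedImm(uint32_t v) {
  for (unsigned rot = 0; rot < 32; rot += 2) {
    uint32_t rotl = (v << rot) | (rot ? (v >> (32 - rot)) : 0);
    if (rotl <= 0xFFu)
      return true;               // v == ROR(imm8, rot) for some 8-bit imm8
  }
  return false;
}

int main() {
  uint32_t r = 12345, c0 = 255, c1 = 100001;
  // The rewrite is always value-preserving...
  assert((r + c0) * c1 == r * c1 + c0 * c1);
  // ...but it can trade a cheap immediate (c0) for an expensive one (c0*c1).
  assert(isA32ModifiedImm(c0));
  assert(!isA32ModifiedImm(c0 * c1));
  return 0;
}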
Base = Ptr->getOperand(0); if (VT == MVT::v4i16) { - if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2)) + if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) return true; } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { if (IsInRange(RHSC, 0x80, 1)) return true; - } else if (Alignment >= 4 && + } else if (Align >= 4 && (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && IsInRange(RHSC, 0x80, 4)) return true; - else if (Alignment >= 2 && + else if (Align >= 2 && (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && IsInRange(RHSC, 0x80, 2)) return true; @@ -19696,28 +15511,28 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, EVT VT; SDValue Ptr; - Align Alignment; + unsigned Align; bool isSEXTLoad = false; bool IsMasked = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); - Alignment = LD->getAlign(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); - Alignment = ST->getAlign(); + Align = ST->getAlignment(); } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); - Alignment = LD->getAlign(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; IsMasked = true; } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); - Alignment = ST->getAlign(); + Align = ST->getAlignment(); IsMasked = true; } else return false; @@ -19726,9 +15541,9 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, bool isLegal = false; if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && - getMVEIndexedAddressParts( - Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked, - Subtarget->isLittle(), Base, Offset, isInc, DAG); + getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, + IsMasked, Subtarget->isLittle(), Base, + Offset, isInc, DAG); else { if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, @@ -19754,31 +15569,31 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; - Align Alignment; + unsigned Align; bool isSEXTLoad = false, isNonExt; bool IsMasked = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); - Alignment = LD->getAlign(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); - Alignment = ST->getAlign(); + Align = ST->getAlignment(); isNonExt = !ST->isTruncatingStore(); } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); - Alignment = LD->getAlign(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; IsMasked = true; } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); - Alignment = ST->getAlign(); + Align = ST->getAlignment(); isNonExt = !ST->isTruncatingStore(); IsMasked = true; } else @@ -19793,8 +15608,6 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 
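// Illustrative aside, not part of the diff: the IsInRange(RHSC, 0x80, N) calls above
// gate pre/post-indexed MVE loads/stores on the constant offset.  Hedged standalone
// model, assuming the helper (not shown in this hunk) requires the offset to be a whole
// number of elements with the scaled offset inside roughly a signed 7-bit window.
#include <cassert>
#include <cstdint>

static bool mveOffsetFits(int64_t offset, int64_t limit, int scale) {
  if (offset % scale != 0)
    return false;                             // must be a whole number of elements
  int64_t scaled = offset / scale;
  return scaled > -limit && scaled < limit;   // assumed ~7-bit signed window
}

int main() {
  // 32-bit elements (scale 4, limit 0x80): about +/-508 in steps of 4.
  assert(mveOffsetFits(508, 0x80, 4));
  assert(!mveOffsetFits(512, 0x80, 4));       // one element too far
  assert(!mveOffsetFits(6, 0x80, 4));         // not a multiple of the element size
  // 16-bit elements (scale 2): the same window, scaled by 2.
  assert(mveOffsetFits(-254, 0x80, 2));
  return 0;
}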
if (!RHS || RHS->getZExtValue() != 4) return false; - if (Alignment < Align(4)) - return false; Offset = Op->getOperand(1); Base = Op->getOperand(0); @@ -19806,7 +15619,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isLegal = false; if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && - getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked, + getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked, Subtarget->isLittle(), Base, Offset, isInc, DAG); else { @@ -19868,7 +15681,8 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, return; KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1); - Known = KnownBits::commonBits(Known, KnownRHS); + Known.Zero &= KnownRHS.Zero; + Known.One &= KnownRHS.One; return; } case ISD::INTRINSIC_W_CHAIN: { @@ -19920,45 +15734,18 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, if (Op.getOpcode() == ARMISD::VGETLANEs) Known = Known.sext(DstSz); else { - Known = Known.zext(DstSz); + Known = Known.zext(DstSz, true /* extended bits are known zero */); } assert(DstSz == Known.getBitWidth()); break; } - case ARMISD::VMOVrh: { - KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); - assert(KnownOp.getBitWidth() == 16); - Known = KnownOp.zext(32); - break; - } - case ARMISD::CSINC: - case ARMISD::CSINV: - case ARMISD::CSNEG: { - KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); - KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); - - // The result is either: - // CSINC: KnownOp0 or KnownOp1 + 1 - // CSINV: KnownOp0 or ~KnownOp1 - // CSNEG: KnownOp0 or KnownOp1 * -1 - if (Op.getOpcode() == ARMISD::CSINC) - KnownOp1 = KnownBits::computeForAddSub( - true, false, KnownOp1, KnownBits::makeConstant(APInt(32, 1))); - else if (Op.getOpcode() == ARMISD::CSINV) - std::swap(KnownOp1.Zero, KnownOp1.One); - else if (Op.getOpcode() == ARMISD::CSNEG) - KnownOp1 = KnownBits::mul( - KnownOp1, KnownBits::makeConstant(APInt(32, -1))); - - Known = KnownBits::commonBits(KnownOp0, KnownOp1); - break; - } } } -bool ARMTargetLowering::targetShrinkDemandedConstant( - SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, - TargetLoweringOpt &TLO) const { +bool +ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op, + const APInt &DemandedAPInt, + TargetLoweringOpt &TLO) const { // Delay optimization, so we don't have to deal with illegal types, or block // optimizations. if (!TLO.LegalOps) @@ -19983,7 +15770,7 @@ bool ARMTargetLowering::targetShrinkDemandedConstant( unsigned Mask = C->getZExtValue(); - unsigned Demanded = DemandedBits.getZExtValue(); + unsigned Demanded = DemandedAPInt.getZExtValue(); unsigned ShrunkMask = Mask & Demanded; unsigned ExpandedMask = Mask | ~Demanded; @@ -20038,43 +15825,6 @@ bool ARMTargetLowering::targetShrinkDemandedConstant( return false; } -bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode( - SDValue Op, const APInt &OriginalDemandedBits, - const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, - unsigned Depth) const { - unsigned Opc = Op.getOpcode(); - - switch (Opc) { - case ARMISD::ASRL: - case ARMISD::LSRL: { - // If this is result 0 and the other result is unused, see if the demand - // bits allow us to shrink this long shift into a standard small shift in - // the opposite direction. 
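// Illustrative aside, not part of the diff: the ShrunkMask / ExpandedMask pair above
// bounds which replacement AND constants are sound.  Any NewMask with
// ShrunkMask subset-of NewMask subset-of ExpandedMask agrees with Mask on every
// demanded bit, so 'x & Mask' can be rewritten with a cheaper constant (for example a
// mask like 0xFF that maps onto UXTB).  Brute-force check of that claim on a sample:
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Mask     = 0x00FF00FFu;  // original AND constant, not a single cheap imm
  uint32_t Demanded = 0x000000FFu;  // only these result bits are ever used
  uint32_t Shrunk   = Mask & Demanded;
  uint32_t Expanded = Mask | ~Demanded;

  uint32_t NewMask = 0xFFu;         // candidate cheaper constant
  assert((Shrunk & ~NewMask) == 0 && (NewMask & ~Expanded) == 0); // within bounds

  for (uint32_t x = 0; x < (1u << 16); ++x)   // sample of inputs
    assert(((x & Mask) & Demanded) == ((x & NewMask) & Demanded));
  return 0;
}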
- if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) && - isa<ConstantSDNode>(Op->getOperand(2))) { - unsigned ShAmt = Op->getConstantOperandVal(2); - if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32) - << (32 - ShAmt))) - return TLO.CombineTo( - Op, TLO.DAG.getNode( - ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1), - TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32))); - } - break; - } - case ARMISD::VBICIMM: { - SDValue Op0 = Op.getOperand(0); - unsigned ModImm = Op.getConstantOperandVal(1); - unsigned EltBits = 0; - uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits); - if ((OriginalDemandedBits & Mask) == 0) - return TLO.CombineTo(Op, Op0); - } - } - - return TargetLowering::SimplifyDemandedBitsForTargetNode( - Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); -} //===----------------------------------------------------------------------===// // ARM Inline Assembly Support @@ -20085,7 +15835,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { if (!Subtarget->hasV6Ops()) return false; - InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); + InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); std::string AsmStr = IA->getAsmString(); SmallVector<StringRef, 4> AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); @@ -20093,7 +15843,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { switch (AsmPieces.size()) { default: return false; case 1: - AsmStr = std::string(AsmPieces[0]); + AsmStr = AsmPieces[0]; AsmPieces.clear(); SplitString(AsmStr, AsmPieces, " \t,"); @@ -20217,8 +15967,6 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint( case 'w': if (VT == MVT::Other) break; - if (VT == MVT::f16 || VT == MVT::bf16) - return RCPair(0U, &ARM::HPRRegClass); if (VT == MVT::f32) return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) @@ -20239,8 +15987,6 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint( case 't': if (VT == MVT::Other) break; - if (VT == MVT::f16 || VT == MVT::bf16) - return RCPair(0U, &ARM::HPRRegClass); if (VT == MVT::f32 || VT == MVT::i32) return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) @@ -20268,7 +16014,7 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint( break; } - if (StringRef("{cc}").equals_insensitive(Constraint)) + if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); @@ -20492,21 +16238,8 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { "Invalid opcode for Div/Rem lowering"); bool isSigned = (Opcode == ISD::SDIVREM); EVT VT = Op->getValueType(0); - SDLoc dl(Op); - - if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) { - SmallVector<SDValue> Result; - if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) { - SDValue Res0 = - DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]); - SDValue Res1 = - DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]); - return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), - {Res0, Res1}); - } - } - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + SDLoc dl(Op); // If the target has hardware divide, use divide + multiply + subtract: // div = a / b @@ -20556,20 +16289,11 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { // Lowers REM using divmod helpers // see RTABI section 4.2/4.3 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { - EVT VT = 
N->getValueType(0); - - if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) { - SmallVector<SDValue> Result; - if (expandDIVREMByConstant(N, Result, MVT::i32, DAG)) - return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0), - Result[0], Result[1]); - } - // Build return types (div and rem) std::vector<Type*> RetTyParams; Type *RetTyElement; - switch (VT.getSimpleVT().SimpleTy) { + switch (N->getValueType(0).getSimpleVT().SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; @@ -20618,15 +16342,13 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const if (DAG.getMachineFunction().getFunction().hasFnAttribute( "no-stack-arg-probe")) { - MaybeAlign Align = - cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); + unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); if (Align) - SP = - DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32)); + SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align, DL, MVT::i32)); Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); SDValue Ops[2] = { SP, Chain }; return DAG.getMergeValues(Ops, DL); @@ -20741,6 +16463,38 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result; } +void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); + MVT HalfT = MVT::i32; + SDLoc dl(N); + SDValue Hi, Lo, Tmp; + + if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || + !isOperationLegalOrCustom(ISD::UADDO, HalfT)) + return ; + + unsigned OpTypeBits = HalfT.getScalarSizeInBits(); + SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); + + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(0, dl, HalfT)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(1, dl, HalfT)); + + Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, + DAG.getConstant(OpTypeBits - 1, dl, + getShiftAmountTy(HalfT, DAG.getDataLayout()))); + Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); + Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, + SDValue(Lo.getNode(), 1)); + Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); + Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); + + Results.push_back(Lo); + Results.push_back(Hi); +} + bool ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The ARM target isn't yet aware of offsets. 
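// Illustrative aside, not part of the diff: the lowerABS hunk above expands a 64-bit
// abs into 32-bit pieces: broadcast the sign into a word with an arithmetic shift, add
// it through the UADDO/ADDCARRY chain, then XOR both halves with it.  Standalone model:
#include <cassert>
#include <cstdint>

static uint64_t abs64_via_words(uint32_t lo, uint32_t hi) {
  uint32_t sign = (uint32_t)((int32_t)hi >> 31);   // 0 or 0xFFFFFFFF (SRA by 31)
  uint64_t sum  = (uint64_t)lo + sign;             // UADDO: low add, capture carry
  uint32_t newLo = (uint32_t)sum;
  uint32_t carry = (uint32_t)(sum >> 32);
  uint32_t newHi = hi + sign + carry;              // ADDCARRY on the high half
  newLo ^= sign;                                   // conditional negate of both halves
  newHi ^= sign;
  return ((uint64_t)newHi << 32) | newLo;
}

int main() {
  int64_t samples[] = {0, 1, -1, 123456789012345LL, -123456789012345LL, INT64_MIN + 1};
  for (int64_t v : samples) {
    uint32_t lo = (uint32_t)(uint64_t)v;
    uint32_t hi = (uint32_t)((uint64_t)v >> 32);
    assert(abs64_via_words(lo, hi) == (uint64_t)(v < 0 ? -v : v));
  }
  return 0;
}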
@@ -20765,9 +16519,6 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, return false; if (VT == MVT::f16 && Subtarget->hasFullFP16()) return ARM_AM::getFP16Imm(Imm) != -1; - if (VT == MVT::f32 && Subtarget->hasFullFP16() && - ARM_AM::getFP32FP16Imm(Imm) != -1) - return true; if (VT == MVT::f32) return ARM_AM::getFP32Imm(Imm) != -1; if (VT == MVT::f64 && Subtarget->hasFP64()) @@ -20800,8 +16551,8 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Value *AlignArg = I.getArgOperand(I.arg_size() - 1); - Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -20814,7 +16565,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(I.arg_size() - 1); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align.reset(); // volatile loads with NEON intrinsics not supported @@ -20832,7 +16583,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // Conservatively set memVT to the entire set of vectors stored. auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); unsigned NumElts = 0; - for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) { + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; @@ -20841,8 +16592,8 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Value *AlignArg = I.getArgOperand(I.arg_size() - 1); - Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -20854,7 +16605,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // Conservatively set memVT to the entire set of vectors stored. auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); unsigned NumElts = 0; - for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) { + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; @@ -20868,115 +16619,27 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore; return true; } - case Intrinsic::arm_mve_vld2q: - case Intrinsic::arm_mve_vld4q: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - // Conservatively set memVT to the entire set of vectors loaded. - Type *VecTy = cast<StructType>(I.getType())->getElementType(1); - unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 
2 : 4; - Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = Align(VecTy->getScalarSizeInBits() / 8); - // volatile loads with MVE intrinsics not supported - Info.flags = MachineMemOperand::MOLoad; - return true; - } - case Intrinsic::arm_mve_vst2q: - case Intrinsic::arm_mve_vst4q: { - Info.opc = ISD::INTRINSIC_VOID; - // Conservatively set memVT to the entire set of vectors stored. - Type *VecTy = I.getArgOperand(1)->getType(); - unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4; - Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = Align(VecTy->getScalarSizeInBits() / 8); - // volatile stores with MVE intrinsics not supported - Info.flags = MachineMemOperand::MOStore; - return true; - } - case Intrinsic::arm_mve_vldr_gather_base: - case Intrinsic::arm_mve_vldr_gather_base_predicated: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = nullptr; - Info.memVT = MVT::getVT(I.getType()); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOLoad; - return true; - } - case Intrinsic::arm_mve_vldr_gather_base_wb: - case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = nullptr; - Info.memVT = MVT::getVT(I.getType()->getContainedType(0)); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOLoad; - return true; - } - case Intrinsic::arm_mve_vldr_gather_offset: - case Intrinsic::arm_mve_vldr_gather_offset_predicated: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = nullptr; - MVT DataVT = MVT::getVT(I.getType()); - unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue(); - Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize), - DataVT.getVectorNumElements()); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOLoad; - return true; - } - case Intrinsic::arm_mve_vstr_scatter_base: - case Intrinsic::arm_mve_vstr_scatter_base_predicated: { - Info.opc = ISD::INTRINSIC_VOID; - Info.ptrVal = nullptr; - Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOStore; - return true; - } - case Intrinsic::arm_mve_vstr_scatter_base_wb: - case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = nullptr; - Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOStore; - return true; - } - case Intrinsic::arm_mve_vstr_scatter_offset: - case Intrinsic::arm_mve_vstr_scatter_offset_predicated: { - Info.opc = ISD::INTRINSIC_VOID; - Info.ptrVal = nullptr; - MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType()); - unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); - Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize), - DataVT.getVectorNumElements()); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOStore; - return true; - } case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); - Type *ValTy = I.getParamElementType(0); + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(ValTy); + Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlign(ValTy); + Info.align = 
MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } case Intrinsic::arm_stlex: case Intrinsic::arm_strex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); - Type *ValTy = I.getParamElementType(1); + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(ValTy); + Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlign(ValTy); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -21027,7 +16690,7 @@ bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, return (Index == 0 || Index == ResVT.getVectorNumElements()); } -Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, +Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -21057,7 +16720,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, } // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html -Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, +Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { switch (Ord) { @@ -21070,7 +16733,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, case AtomicOrdering::SequentiallyConsistent: if (!Inst->hasAtomicStore()) return nullptr; // Nothing to do - [[fallthrough]]; + LLVM_FALLTHROUGH; case AtomicOrdering::Release: case AtomicOrdering::AcquireRelease: if (Subtarget->preferISHSTBarriers()) @@ -21082,7 +16745,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, llvm_unreachable("Unknown fence ordering in emitLeadingFence"); } -Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, +Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { switch (Ord) { @@ -21104,19 +16767,9 @@ Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit // anything for those. -TargetLoweringBase::AtomicExpansionKind -ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { - bool has64BitAtomicStore; - if (Subtarget->isMClass()) - has64BitAtomicStore = false; - else if (Subtarget->isThumb()) - has64BitAtomicStore = Subtarget->hasV7Ops(); - else - has64BitAtomicStore = Subtarget->hasV6Ops(); - +bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); - return Size == 64 && has64BitAtomicStore ? 
AtomicExpansionKind::Expand - : AtomicExpansionKind::None; + return (Size == 64) && !Subtarget->isMClass(); } // Loads and stores less than 64-bits are already atomic; ones above that @@ -21128,17 +16781,9 @@ ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // sections A8.8.72-74 LDRD) TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { - bool has64BitAtomicLoad; - if (Subtarget->isMClass()) - has64BitAtomicLoad = false; - else if (Subtarget->isThumb()) - has64BitAtomicLoad = Subtarget->hasV7Ops(); - else - has64BitAtomicLoad = Subtarget->hasV6Ops(); - unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly - : AtomicExpansionKind::None; + return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly + : AtomicExpansionKind::None; } // For the real atomic operations, we have ldrex/strex up to 32 bits, @@ -21149,28 +16794,12 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { return AtomicExpansionKind::CmpXChg; unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - bool hasAtomicRMW; - if (Subtarget->isMClass()) - hasAtomicRMW = Subtarget->hasV8MBaselineOps(); - else if (Subtarget->isThumb()) - hasAtomicRMW = Subtarget->hasV7Ops(); - else - hasAtomicRMW = Subtarget->hasV6Ops(); - if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) { - // At -O0, fast-regalloc cannot cope with the live vregs necessary to - // implement atomicrmw without spilling. If the target address is also on - // the stack and close enough to the spill slot, this can lead to a - // situation where the monitor always gets cleared and the atomic operation - // can never succeed. So at -O0 lower this operation to a CAS loop. - if (getTargetMachine().getOptLevel() == CodeGenOpt::None) - return AtomicExpansionKind::CmpXChg; - return AtomicExpansionKind::LLSC; - } - return AtomicExpansionKind::None; + bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); + return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) + ? AtomicExpansionKind::LLSC + : AtomicExpansionKind::None; } -// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32 -// bits, and up to 64 bits on the non-M profiles. TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { // At -O0, fast-regalloc cannot cope with the live vregs necessary to @@ -21178,16 +16807,9 @@ ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { // on the stack and close enough to the spill slot, this can lead to a // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. - unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits(); - bool HasAtomicCmpXchg; - if (Subtarget->isMClass()) - HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps(); - else if (Subtarget->isThumb()) - HasAtomicCmpXchg = Subtarget->hasV7Ops(); - else - HasAtomicCmpXchg = Subtarget->hasV6Ops(); - if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg && - Size <= (Subtarget->isMClass() ? 
32U : 64U)) + bool HasAtomicCmpXchg = + !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); + if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg) return AtomicExpansionKind::LLSC; return AtomicExpansionKind::None; } @@ -21197,9 +16819,9 @@ bool ARMTargetLowering::shouldInsertFencesForAtomic( return InsertFencesForAtomic; } +// This has so far only been implemented for MachO. bool ARMTargetLowering::useLoadStackGuardNode() const { - // ROPI/RWPI are not supported currently. - return !Subtarget->isROPI() && !Subtarget->isRWPI(); + return Subtarget->isTargetMachO(); } void ARMTargetLowering::insertSSPDeclarations(Module &M) const { @@ -21215,7 +16837,7 @@ void ARMTargetLowering::insertSSPDeclarations(Module &M) const { "__security_check_cookie", Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext())); if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) - F->addParamAttr(0, Attribute::AttrKind::InReg); + F->addAttribute(1, Attribute::AttrKind::InReg); } Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const { @@ -21251,7 +16873,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); - unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue(); + unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth(); // We can do a store + vector extract on any vector that fits perfectly in a D // or Q register. if (BitWidth == 64 || BitWidth == 128) { @@ -21261,48 +16883,28 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; } -bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { +bool ARMTargetLowering::isCheapToSpeculateCttz() const { return Subtarget->hasV6T2Ops(); } -bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { +bool ARMTargetLowering::isCheapToSpeculateCtlz() const { return Subtarget->hasV6T2Ops(); } -bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial( - const Instruction &AndI) const { - if (!Subtarget->hasV7Ops()) - return false; - - // Sink the `and` instruction only if the mask would fit into a modified - // immediate operand. - ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1)); - if (!Mask || Mask->getValue().getBitWidth() > 32u) - return false; - auto MaskVal = unsigned(Mask->getValue().getZExtValue()); - return (Subtarget->isThumb2() ? 
ARM_AM::getT2SOImmVal(MaskVal) - : ARM_AM::getSOImmVal(MaskVal)) != -1; +bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { + return !Subtarget->hasMinSize() || Subtarget->isTargetWindows(); } -TargetLowering::ShiftLegalizationStrategy -ARMTargetLowering::preferredShiftLegalizationStrategy( - SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const { - if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows()) - return ShiftLegalizationStrategy::LowerToLibcall; - return TargetLowering::preferredShiftLegalizationStrategy(DAG, N, - ExpansionFactor); -} - -Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, - Value *Addr, +Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); bool IsAcquire = isAcquireOrStronger(Ord); // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i32, i32} and we have to recombine them into a // single i64 here. - if (ValueTy->getPrimitiveSizeInBits() == 64) { + if (ValTy->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; Function *Ldrex = Intrinsic::getDeclaration(M, Int); @@ -21314,32 +16916,31 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); if (!Subtarget->isLittle()) std::swap (Lo, Hi); - Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); - Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); + Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); + Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); return Builder.CreateOr( - Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64"); + Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); } Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); - CallInst *CI = Builder.CreateCall(Ldrex, Addr); - CI->addParamAttr( - 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy)); - return Builder.CreateTruncOrBitCast(CI, ValueTy); + return Builder.CreateTruncOrBitCast( + Builder.CreateCall(Ldrex, Addr), + cast<PointerType>(Addr->getType())->getElementType()); } void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( - IRBuilderBase &Builder) const { + IRBuilder<> &Builder) const { if (!Subtarget->hasV7Ops()) return; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); } -Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, - Value *Val, Value *Addr, +Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); bool IsRelease = isReleaseOrStronger(Ord); @@ -21365,13 +16966,10 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, Type *Tys[] = { Addr->getType() }; Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); - CallInst *CI = Builder.CreateCall( + return Builder.CreateCall( Strex, {Builder.CreateZExtOrBitCast( Val, Strex->getFunctionType()->getParamType(0)), Addr}); - CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType, - Val->getType())); - return CI; } @@ -21388,8 +16986,7 @@ ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, } bool ARMTargetLowering::isLegalInterleavedAccessType( - unsigned Factor, FixedVectorType *VecTy, Align Alignment, - const DataLayout &DL) const { + unsigned Factor, VectorType *VecTy, const DataLayout &DL) const { unsigned VecSize = DL.getTypeSizeInBits(VecTy); unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); @@ -21412,9 +17009,6 @@ bool ARMTargetLowering::isLegalInterleavedAccessType( // Ensure the element type is legal. if (ElSize != 8 && ElSize != 16 && ElSize != 32) return false; - // And the alignment if high enough under MVE. - if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8) - return false; // Ensure the total vector size is 64 or a multiple of 128. Types larger than // 128 will be split into multiple interleaved accesses. @@ -21451,16 +17045,15 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); - Type *EltTy = VecTy->getElementType(); + VectorType *VecTy = Shuffles[0]->getType(); + Type *EltTy = VecTy->getVectorElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); - Align Alignment = LI->getAlign(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL)) + if (!isLegalInterleavedAccessType(Factor, VecTy, DL)) return false; unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); @@ -21468,7 +17061,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. 
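// Illustrative aside, not part of the diff: the emitLoadLinked / emitStoreConditional
// hunks a little further up rebuild an i64 from the {i32, i32} pair that ldrexd/ldaexd
// returns (swapping the halves on big-endian) and split it back up for strexd.  Minimal
// model of that split/recombine round trip:
#include <cassert>
#include <cstdint>
#include <utility>

static uint64_t combine(uint32_t lo, uint32_t hi, bool bigEndian) {
  if (bigEndian)                  // the pair comes back in the opposite order
    std::swap(lo, hi);
  return (uint64_t)lo | ((uint64_t)hi << 32);
}

static void split(uint64_t v, uint32_t &lo, uint32_t &hi) {
  lo = (uint32_t)v;               // bits 31..0
  hi = (uint32_t)(v >> 32);       // bits 63..32
}

int main() {
  uint64_t v = 0x1122334455667788ULL;
  uint32_t lo, hi;
  split(v, lo, hi);
  assert(combine(lo, hi, /*bigEndian=*/false) == v);
  assert(combine(hi, lo, /*bigEndian=*/true) == v);  // swapped pair on the BE path
  return 0;
}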
if (EltTy->isPointerTy()) - VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy); + VecTy = + VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); IRBuilder<> Builder(LI); @@ -21478,15 +17072,15 @@ bool ARMTargetLowering::lowerInterleavedLoad( if (NumLoads > 1) { // If we're going to generate more than one load, reset the sub-vector type // to something legal. - VecTy = FixedVectorType::get(VecTy->getElementType(), - VecTy->getNumElements() / NumLoads); + VecTy = VectorType::get(VecTy->getVectorElementType(), + VecTy->getVectorNumElements() / NumLoads); // We will compute the pointer operand of each load from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( - BaseAddr, - VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); + BaseAddr, VecTy->getVectorElementType()->getPointerTo( + LI->getPointerAddressSpace())); } assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); @@ -21503,7 +17097,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( SmallVector<Value *, 2> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); - Ops.push_back(Builder.getInt32(LI->getAlign().value())); + Ops.push_back(Builder.getInt32(LI->getAlignment())); return Builder.CreateCall(VldnFunc, Ops, "vldN"); } else { @@ -21511,8 +17105,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( "expected interleave factor of 2 or 4 for MVE"); Intrinsic::ID LoadInts = Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q; - Type *VecEltTy = - VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()); + Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo( + LI->getPointerAddressSpace()); Type *Tys[] = {VecTy, VecEltTy}; Function *VldnFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); @@ -21532,8 +17126,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) - BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr, - VecTy->getNumElements() * Factor); + BaseAddr = + Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, + VecTy->getVectorNumElements() * Factor); CallInst *VldN = createLoadIntrinsic(BaseAddr); @@ -21548,8 +17143,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( // Convert the integer vector to pointer vector if the element is pointer. 
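// Illustrative aside, not part of the diff: lowerInterleavedLoad above maps a wide load
// plus deinterleaving shuffles onto vld2/vld4 (or MVE vld2q/vld4q), which peel
// interleaved data apart lane by lane.  Scalar model of the factor-2 case that the
// intrinsic performs in a single operation:
#include <cassert>
#include <cstdint>

int main() {
  // Memory holds interleaved pairs: x0 y0 x1 y1 x2 y2 x3 y3.
  uint32_t mem[8] = {10, 20, 11, 21, 12, 22, 13, 23};

  uint32_t x[4], y[4];              // the two results of a factor-2 "vld2"
  for (unsigned lane = 0; lane < 4; ++lane) {
    x[lane] = mem[2 * lane + 0];    // even elements -> first result vector
    y[lane] = mem[2 * lane + 1];    // odd elements  -> second result vector
  }

  for (unsigned lane = 0; lane < 4; ++lane) {
    assert(x[lane] == 10u + lane);
    assert(y[lane] == 20u + lane);
  }
  return 0;
}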
if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr( - SubVec, - FixedVectorType::get(SV->getType()->getElementType(), VecTy)); + SubVec, VectorType::get(SV->getType()->getVectorElementType(), + VecTy->getVectorNumElements())); SubVecs[SV].push_back(SubVec); } @@ -21601,20 +17196,20 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - auto *VecTy = cast<FixedVectorType>(SVI->getType()); - assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); + VectorType *VecTy = SVI->getType(); + assert(VecTy->getVectorNumElements() % Factor == 0 && + "Invalid interleaved store"); - unsigned LaneLen = VecTy->getNumElements() / Factor; - Type *EltTy = VecTy->getElementType(); - auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); + unsigned LaneLen = VecTy->getVectorNumElements() / Factor; + Type *EltTy = VecTy->getVectorElementType(); + VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); - Align Alignment = SI->getAlign(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL)) + if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL)) return false; unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); @@ -21629,12 +17224,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *IntTy = DL.getIntPtrType(EltTy); // Convert to the corresponding integer vector. - auto *IntVecTy = - FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType())); + Type *IntVecTy = + VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); - SubVecTy = FixedVectorType::get(IntTy, LaneLen); + SubVecTy = VectorType::get(IntTy, LaneLen); } // The base address of the store. @@ -21644,14 +17239,14 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // If we're going to generate more than one store, reset the lane length // and sub-vector type to something legal. LaneLen /= NumStores; - SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); + SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); // We will compute the pointer operand of each store from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( - BaseAddr, - SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); + BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( + SI->getPointerAddressSpace())); } assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!"); @@ -21672,15 +17267,16 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SmallVector<Value *, 6> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); - append_range(Ops, Shuffles); - Ops.push_back(Builder.getInt32(SI->getAlign().value())); + for (auto S : Shuffles) + Ops.push_back(S); + Ops.push_back(Builder.getInt32(SI->getAlignment())); Builder.CreateCall(VstNFunc, Ops); } else { assert((Factor == 2 || Factor == 4) && "expected interleave factor of 2 or 4 for MVE"); Intrinsic::ID StoreInts = Factor == 2 ? 
Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q; - Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo( + Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo( SI->getPointerAddressSpace()); Type *Tys[] = {EltPtrTy, SubVecTy}; Function *VstNFunc = @@ -21688,7 +17284,8 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SmallVector<Value *, 6> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy)); - append_range(Ops, Shuffles); + for (auto S : Shuffles) + Ops.push_back(S); for (unsigned F = 0; F < Factor; F++) { Ops.push_back(Builder.getInt32(F)); Builder.CreateCall(VstNFunc, Ops); @@ -21701,7 +17298,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // If we generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) - BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(), + BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), BaseAddr, LaneLen * Factor); SmallVector<Value *, 4> Shuffles; @@ -21711,7 +17308,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { Shuffles.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0))); + Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); } else { unsigned StartMask = 0; for (unsigned j = 1; j < LaneLen; j++) { @@ -21728,7 +17325,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask Shuffles.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(StartMask, LaneLen, 0))); + Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } } @@ -21776,11 +17373,11 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, case HA_DOUBLE: return false; case HA_VECT64: - return VT->getPrimitiveSizeInBits().getFixedValue() == 64; + return VT->getBitWidth() == 64; case HA_VECT128: - return VT->getPrimitiveSizeInBits().getFixedValue() == 128; + return VT->getBitWidth() == 128; case HA_UNKNOWN: - switch (VT->getPrimitiveSizeInBits().getFixedValue()) { + switch (VT->getBitWidth()) { case 64: Base = HA_VECT64; return true; @@ -21797,9 +17394,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, } /// Return the correct alignment for the current calling convention. -Align ARMTargetLowering::getABIAlignmentForCallingConv( - Type *ArgTy, const DataLayout &DL) const { - const Align ABITypeAlign = DL.getABITypeAlign(ArgTy); +Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, + DataLayout DL) const { + const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy)); if (!ArgTy->isVectorTy()) return ABITypeAlign; @@ -21812,8 +17409,7 @@ Align ARMTargetLowering::getABIAlignmentForCallingConv( /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when /// passing according to AAPCS rules. 
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( - Type *Ty, CallingConv::ID CallConv, bool isVarArg, - const DataLayout &DL) const { + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { if (getEffectiveCallingConv(CallConv, isVarArg) != CallingConv::ARM_AAPCS_VFP) return false; @@ -21827,18 +17423,18 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( return IsHA || IsIntArray; } -Register ARMTargetLowering::getExceptionPointerRegister( +unsigned ARMTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. - return Subtarget->useSjLjEH() ? Register() : ARM::R0; + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; } -Register ARMTargetLowering::getExceptionSelectorRegister( +unsigned ARMTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. - return Subtarget->useSjLjEH() ? Register() : ARM::R1; + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; } void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { @@ -21892,105 +17488,3 @@ void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const { MF.getFrameInfo().computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } - -bool ARMTargetLowering::isComplexDeinterleavingSupported() const { - return Subtarget->hasMVEIntegerOps(); -} - -bool ARMTargetLowering::isComplexDeinterleavingOperationSupported( - ComplexDeinterleavingOperation Operation, Type *Ty) const { - auto *VTy = dyn_cast<FixedVectorType>(Ty); - if (!VTy) - return false; - - auto *ScalarTy = VTy->getScalarType(); - unsigned NumElements = VTy->getNumElements(); - - unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements; - if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth)) - return false; - - // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32 - if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy()) - return Subtarget->hasMVEFloatOps(); - - if (Operation != ComplexDeinterleavingOperation::CAdd) - return false; - - return Subtarget->hasMVEIntegerOps() && - (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) || - ScalarTy->isIntegerTy(32)); -} - -Value *ARMTargetLowering::createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, - ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, - Value *Accumulator) const { - - FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType()); - - IRBuilder<> B(I); - - unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements(); - - assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits"); - - if (TyWidth > 128) { - int Stride = Ty->getNumElements() / 2; - auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements()); - auto SplitSeqVec = llvm::to_vector(SplitSeq); - ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride); - ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride); - - auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask); - auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask); - auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask); - auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask); - Value *LowerSplitAcc = nullptr; - Value *UpperSplitAcc = nullptr; - - if (Accumulator) { - LowerSplitAcc = 
B.CreateShuffleVector(Accumulator, LowerSplitMask); - UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask); - } - - auto *LowerSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); - auto *UpperSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); - - ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements()); - return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask); - } - - auto *IntTy = Type::getInt32Ty(B.getContext()); - - ConstantInt *ConstRotation = nullptr; - if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { - ConstRotation = ConstantInt::get(IntTy, (int)Rotation); - - if (Accumulator) - return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty, - {ConstRotation, Accumulator, InputB, InputA}); - return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty, - {ConstRotation, InputB, InputA}); - } - - if (OperationType == ComplexDeinterleavingOperation::CAdd) { - // 1 means the value is not halved. - auto *ConstHalving = ConstantInt::get(IntTy, 1); - - if (Rotation == ComplexDeinterleavingRotation::Rotation_90) - ConstRotation = ConstantInt::get(IntTy, 0); - else if (Rotation == ComplexDeinterleavingRotation::Rotation_270) - ConstRotation = ConstantInt::get(IntTy, 1); - - if (!ConstRotation) - return nullptr; // Invalid rotation for arm_mve_vcaddq - - return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty, - {ConstHalving, ConstRotation, InputA, InputB}); - } - - return nullptr; -} |
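// Illustrative aside, not part of the diff: the CAdd path above selects MVE VCADD with
// a rotation of 90 or 270 degrees, i.e. it adds i*b or -i*b to a on interleaved
// (re, im) lanes.  Scalar check of those two identities, with the lane arithmetic
// written out the way the rotation is assumed to apply:
#include <cassert>
#include <complex>

int main() {
  std::complex<float> a(1.5f, -2.0f), b(0.25f, 4.0f);
  const std::complex<float> i(0.0f, 1.0f);

  // Rotation 90:  re = a.re - b.im, im = a.im + b.re  (a + i*b)
  std::complex<float> rot90(a.real() - b.imag(), a.imag() + b.real());
  assert(rot90 == a + i * b);

  // Rotation 270: re = a.re + b.im, im = a.im - b.re  (a - i*b)
  std::complex<float> rot270(a.real() + b.imag(), a.imag() - b.real());
  assert(rot270 == a - i * b);
  return 0;
}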