Diffstat (limited to 'lib/Target/ARM')
-rw-r--r--   lib/Target/ARM/ARM.td                          |   4
-rw-r--r--   lib/Target/ARM/ARMAddressingModes.h            |   5
-rw-r--r--   lib/Target/ARM/ARMISelLowering.cpp             | 466
-rw-r--r--   lib/Target/ARM/ARMISelLowering.h               |  21
-rw-r--r--   lib/Target/ARM/ARMInstrInfo.td                 |  41
-rw-r--r--   lib/Target/ARM/ARMInstrNEON.td                 | 145
-rw-r--r--   lib/Target/ARM/ARMInstrThumb2.td               |  37
-rw-r--r--   lib/Target/ARM/ARMSubtarget.h                  |   4
-rw-r--r--   lib/Target/ARM/AsmParser/ARMAsmParser.cpp      |  14
-rw-r--r--   lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp    |   6
-rw-r--r--   lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp   |   2
-rw-r--r--   lib/Target/ARM/README.txt                      |  67
12 files changed, 517 insertions, 295 deletions
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index f1e6a9f..fa64d6c 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -48,6 +48,8 @@ def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
                                     "Enable divide instructions">;
 def FeatureT2ExtractPack: SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
                                  "Enable Thumb2 extract and pack instructions">;
+def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
+                                         "FP compare + branch is slow">;
 
 // Some processors have multiply-accumulate instructions that don't
 // play nicely with other VFP instructions, and it's generally better
@@ -129,7 +131,7 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries,
 // V7 Processors.
 def : Processor<"cortex-a8", CortexA8Itineraries,
                 [ArchV7A, FeatureThumb2, FeatureNEON, FeatureHasSlowVMLx,
-                 FeatureNEONForFP, FeatureT2ExtractPack]>;
+                 FeatureSlowFPBrcc, FeatureNEONForFP, FeatureT2ExtractPack]>;
 def : Processor<"cortex-a9", CortexA9Itineraries,
                 [ArchV7A, FeatureThumb2, FeatureNEON, FeatureT2ExtractPack]>;
 def : ProcNoItin<"cortex-m3", [ArchV7M, FeatureThumb2, FeatureHWDiv]>;
diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h
index d316b13..92a13f1 100644
--- a/lib/Target/ARM/ARMAddressingModes.h
+++ b/lib/Target/ARM/ARMAddressingModes.h
@@ -519,9 +519,8 @@ namespace ARM_AM {
   //
   // This is stored in two operands [regaddr, align].  The first is the
   // address register. The second operand is the value of the alignment
-  // specifier to use or zero if no explicit alignment.
-  // Valid alignments are: 0, 8, 16, and 32 bytes, depending on the specific
-  // instruction.
+  // specifier in bytes or zero if no explicit alignment.
+  // Valid alignments depend on the specific instruction.
 
   //===--------------------------------------------------------------------===//
   // NEON Modified Immediates
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 98d8b85..0091df7 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -565,6 +565,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
+  case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
   case ARMISD::CMOV:          return "ARMISD::CMOV";
   case ARMISD::CNEG:          return "ARMISD::CNEG";
@@ -623,6 +624,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
+  case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
+  case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
   case ARMISD::VDUP:          return "ARMISD::VDUP";
   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
   case ARMISD::VEXT:          return "ARMISD::VEXT";
@@ -2216,7 +2219,7 @@ static bool isFloatingPointZero(SDValue Op) {
 /// the given operands.
 SDValue
 ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
-                             SDValue &ARMCC, SelectionDAG &DAG,
+                             SDValue &ARMcc, SelectionDAG &DAG,
                              DebugLoc dl) const {
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
     unsigned C = RHSC->getZExtValue();
@@ -2268,48 +2271,14 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     CompareType = ARMISD::CMPZ;
     break;
   }
-  ARMCC = DAG.getConstant(CondCode, MVT::i32);
+  ARMcc = DAG.getConstant(CondCode, MVT::i32);
   return DAG.getNode(CompareType, dl, MVT::Flag, LHS, RHS);
 }
 
-static bool canBitcastToInt(SDNode *Op) {
-  return Op->hasOneUse() &&
-    ISD::isNormalLoad(Op) &&
-    Op->getValueType(0) == MVT::f32;
-}
-
-static SDValue bitcastToInt(SDValue Op, SelectionDAG &DAG) {
-  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
-    return DAG.getLoad(MVT::i32, Op.getDebugLoc(),
-                       Ld->getChain(), Ld->getBasePtr(),
-                       Ld->getSrcValue(), Ld->getSrcValueOffset(),
-                       Ld->isVolatile(), Ld->isNonTemporal(),
-                       Ld->getAlignment());
-
-  llvm_unreachable("Unknown VFP cmp argument!");
-}
-
 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
 SDValue
-ARMTargetLowering::getVFPCmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC,
-                             SDValue &ARMCC, SelectionDAG &DAG,
+ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
                              DebugLoc dl) const {
-  if (UnsafeFPMath && FiniteOnlyFPMath() &&
-      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
-       CC == ISD::SETNE || CC == ISD::SETUNE) &&
-      canBitcastToInt(LHS.getNode()) && canBitcastToInt(RHS.getNode())) {
-    // If unsafe fp math optimization is enabled and there are no othter uses of
-    // the CMP operands, and the condition code is EQ oe NE, we can optimize it
-    // to an integer comparison.
-    if (CC == ISD::SETOEQ)
-      CC = ISD::SETEQ;
-    else if (CC == ISD::SETUNE)
-      CC = ISD::SETNE;
-    LHS = bitcastToInt(LHS, DAG);
-    RHS = bitcastToInt(RHS, DAG);
-    return getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl);
-  }
-
   SDValue Cmp;
   if (!isFloatingPointZero(RHS))
     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Flag, LHS, RHS);
@@ -2328,59 +2297,184 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
 
   if (LHS.getValueType() == MVT::i32) {
-    SDValue ARMCC;
+    SDValue ARMcc;
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl);
-    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, CCR,Cmp);
+    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
   }
 
   ARMCC::CondCodes CondCode, CondCode2;
   FPCCToARMCC(CC, CondCode, CondCode2);
 
-  SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
+  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-  SDValue Cmp = getVFPCmp(LHS, RHS, CC, ARMCC, DAG, dl);
   SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
-                               ARMCC, CCR, Cmp);
+                               ARMcc, CCR, Cmp);
   if (CondCode2 != ARMCC::AL) {
-    SDValue ARMCC2 = DAG.getConstant(CondCode2, MVT::i32);
+    SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32);
     // FIXME: Needs another CMP because flag can have but one use.
-    SDValue Cmp2 = getVFPCmp(LHS, RHS, CC, ARMCC2, DAG, dl);
+    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
     Result = DAG.getNode(ARMISD::CMOV, dl, VT,
-                         Result, TrueVal, ARMCC2, CCR, Cmp2);
+                         Result, TrueVal, ARMcc2, CCR, Cmp2);
   }
   return Result;
 }
 
+/// canChangeToInt - Given the fp compare operand, return true if it is
+/// suitable to morph to an integer compare sequence.
+static bool canChangeToInt(SDValue Op, bool &SeenZero,
+                           const ARMSubtarget *Subtarget) {
+  SDNode *N = Op.getNode();
+  if (!N->hasOneUse())
+    // Otherwise it requires moving the value from fp to integer registers.
+    return false;
+  if (!N->getNumValues())
+    return false;
+  EVT VT = Op.getValueType();
+  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
+    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
+    // vmrs are very slow, e.g. cortex-a8.
+    return false;
+
+  if (isFloatingPointZero(Op)) {
+    SeenZero = true;
+    return true;
+  }
+  return ISD::isNormalLoad(N);
+}
+
+static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
+  if (isFloatingPointZero(Op))
+    return DAG.getConstant(0, MVT::i32);
+
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
+    return DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+                       Ld->getChain(), Ld->getBasePtr(),
+                       Ld->getSrcValue(), Ld->getSrcValueOffset(),
+                       Ld->isVolatile(), Ld->isNonTemporal(),
+                       Ld->getAlignment());
+
+  llvm_unreachable("Unknown VFP cmp argument!");
+}
+
+static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
+                           SDValue &RetVal1, SDValue &RetVal2) {
+  if (isFloatingPointZero(Op)) {
+    RetVal1 = DAG.getConstant(0, MVT::i32);
+    RetVal2 = DAG.getConstant(0, MVT::i32);
+    return;
+  }
+
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
+    SDValue Ptr = Ld->getBasePtr();
+    RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+                          Ld->getChain(), Ptr,
+                          Ld->getSrcValue(), Ld->getSrcValueOffset(),
+                          Ld->isVolatile(), Ld->isNonTemporal(),
+                          Ld->getAlignment());
+
+    EVT PtrType = Ptr.getValueType();
+    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
+    SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(),
+                                 PtrType, Ptr, DAG.getConstant(4, PtrType));
+    RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+                          Ld->getChain(), NewPtr,
+                          Ld->getSrcValue(), Ld->getSrcValueOffset() + 4,
+                          Ld->isVolatile(), Ld->isNonTemporal(),
+                          NewAlign);
+    return;
+  }
+
+  llvm_unreachable("Unknown VFP cmp argument!");
+}
+
+/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize
+/// some f32 and even f64 comparisons to integer ones.
+SDValue
+ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  DebugLoc dl = Op.getDebugLoc();
+
+  bool SeenZero = false;
+  if (canChangeToInt(LHS, SeenZero, Subtarget) &&
+      canChangeToInt(RHS, SeenZero, Subtarget) &&
+      // If one of the operands is zero, it's safe to ignore the NaN case.
+      (FiniteOnlyFPMath() || SeenZero)) {
+    // If unsafe fp math optimization is enabled and there are no other uses
+    // of the CMP operands, and the condition code is EQ or NE, we can
+    // optimize it to an integer comparison.
+    if (CC == ISD::SETOEQ)
+      CC = ISD::SETEQ;
+    else if (CC == ISD::SETUNE)
+      CC = ISD::SETNE;
+
+    SDValue ARMcc;
+    if (LHS.getValueType() == MVT::f32) {
+      LHS = bitcastf32Toi32(LHS, DAG);
+      RHS = bitcastf32Toi32(RHS, DAG);
+      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
+                         Chain, Dest, ARMcc, CCR, Cmp);
+    }
+
+    SDValue LHS1, LHS2;
+    SDValue RHS1, RHS2;
+    expandf64Toi32(LHS, DAG, LHS1, LHS2);
+    expandf64Toi32(RHS, DAG, RHS1, RHS2);
+    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+    ARMcc = DAG.getConstant(CondCode, MVT::i32);
+    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag);
+    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
+    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7);
+  }
+
+  return SDValue();
+}
+
 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Chain = Op.getOperand(0);
+  SDValue  Chain = Op.getOperand(0);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
-  SDValue LHS = Op.getOperand(2);
-  SDValue RHS = Op.getOperand(3);
-  SDValue Dest = Op.getOperand(4);
+  SDValue  LHS = Op.getOperand(2);
+  SDValue  RHS = Op.getOperand(3);
+  SDValue  Dest = Op.getOperand(4);
   DebugLoc dl = Op.getDebugLoc();
 
   if (LHS.getValueType() == MVT::i32) {
-    SDValue ARMCC;
+    SDValue ARMcc;
+    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl);
     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
-                       Chain, Dest, ARMCC, CCR,Cmp);
+                       Chain, Dest, ARMcc, CCR, Cmp);
   }
 
   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  if (UnsafeFPMath &&
+      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
+       CC == ISD::SETNE || CC == ISD::SETUNE)) {
+    SDValue Result = OptimizeVFPBrcond(Op, DAG);
+    if (Result.getNode())
+      return Result;
+  }
+
   ARMCC::CondCodes CondCode, CondCode2;
   FPCCToARMCC(CC, CondCode, CondCode2);
 
-  SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
-  SDValue Cmp = getVFPCmp(LHS, RHS, CC, ARMCC, DAG, dl);
+  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag);
-  SDValue Ops[] = { Chain, Dest, ARMCC, CCR, Cmp };
+  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
   SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
   if (CondCode2 != ARMCC::AL) {
-    ARMCC = DAG.getConstant(CondCode2, MVT::i32);
-    SDValue Ops[] = { Res, Dest, ARMCC, CCR, Res.getValue(1) };
+    ARMcc = DAG.getConstant(CondCode2, MVT::i32);
+    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
     Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
   }
   return Res;
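The rewrite above rests on a bit-level property of IEEE-754: when neither operand is a NaN (guaranteed here by FiniteOnlyFPMath() or by one operand being a floating-point zero, caught via SeenZero), equality of values coincides with equality of bit patterns, up to the +0.0/-0.0 distinction that -enable-unsafe-fp-math waives. A minimal standalone C++ sketch of the idea; helper names such as fpEqAsInt are hypothetical and not part of this patch:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // With NaNs excluded and the signed-zero distinction waived, an f32
    // equality test can be done on the raw bits in an integer register,
    // which is what reloading the operand with an i32 load achieves.
    static uint32_t bitsOf(float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits));
      return Bits;
    }

    static bool fpEqAsInt(float A, float B) {
      return bitsOf(A) == bitsOf(B);
    }

    // f64 is split into two i32 words, mirroring expandf64Toi32; the
    // BCC_i64 node then tests both word pairs for equality.
    static bool fpEqAsInt64(double A, double B) {
      uint64_t BA, BB;
      std::memcpy(&BA, &A, sizeof(BA));
      std::memcpy(&BB, &B, sizeof(BB));
      return uint32_t(BA) == uint32_t(BB) &&
             uint32_t(BA >> 32) == uint32_t(BB >> 32);
    }

    int main() {
      assert(fpEqAsInt(1.5f, 1.5f) && !fpEqAsInt(1.5f, 2.5f));
      assert(fpEqAsInt64(3.25, 3.25) && !fpEqAsInt64(3.25, 0.0));
      return 0;
    }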
@@ -2469,12 +2563,11 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   EVT SrcVT = Tmp1.getValueType();
   SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
-  SDValue ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32);
+  SDValue ARMcc = DAG.getConstant(ARMCC::LT, MVT::i32);
   SDValue FP0 = DAG.getConstantFP(0.0, SrcVT);
-  SDValue Cmp = getVFPCmp(Tmp1, FP0,
-                          ISD::SETLT, ARMCC, DAG, dl);
+  SDValue Cmp = getVFPCmp(Tmp1, FP0, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-  return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp);
+  return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMcc, CCR, Cmp);
 }
 
 SDValue
 ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
@@ -2553,51 +2646,18 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
 }
 
 /// getZeroVector - Returns a vector of specified type with all zero elements.
-///
+/// Zero vectors are used to represent vector negation and in those cases
+/// will be implemented with the NEON VNEG instruction.  However, VNEG does
+/// not support i64 elements, so sometimes the zero vectors will need to be
+/// explicitly constructed.  Regardless, use a canonical VMOV to create the
+/// zero vector.
 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
-
-  // Zero vectors are used to represent vector negation and in those cases
-  // will be implemented with the NEON VNEG instruction.  However, VNEG does
-  // not support i64 elements, so sometimes the zero vectors will need to be
-  // explicitly constructed.  For those cases, and potentially other uses in
-  // the future, always build zero vectors as <16 x i8> or <8 x i8> bitcasted
-  // to their dest type.  This ensures they get CSE'd.
-  SDValue Vec;
-  SDValue Cst = DAG.getTargetConstant(0, MVT::i8);
-  SmallVector<SDValue, 8> Ops;
-  MVT TVT;
-
-  if (VT.getSizeInBits() == 64) {
-    Ops.assign(8, Cst); TVT = MVT::v8i8;
-  } else {
-    Ops.assign(16, Cst); TVT = MVT::v16i8;
-  }
-  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size());
-
-  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
-}
-
-/// getOnesVector - Returns a vector of specified type with all bits set.
-///
-static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
-  assert(VT.isVector() && "Expected a vector type");
-
-  // Always build ones vectors as <16 x i8> or <8 x i8> bitcasted to their
-  // dest type.  This ensures they get CSE'd.
-  SDValue Vec;
-  SDValue Cst = DAG.getTargetConstant(0xFF, MVT::i8);
-  SmallVector<SDValue, 8> Ops;
-  MVT TVT;
-
-  if (VT.getSizeInBits() == 64) {
-    Ops.assign(8, Cst); TVT = MVT::v8i8;
-  } else {
-    Ops.assign(16, Cst); TVT = MVT::v16i8;
-  }
-  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size());
-
-  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+  // The canonical modified immediate encoding of a zero vector is....0!
+  SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
+  EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov);
 }
 
 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
@@ -2611,7 +2671,7 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt  = Op.getOperand(2);
-  SDValue ARMCC;
+  SDValue ARMcc;
   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ?
                  ISD::SRA : ISD::SRL;
 
   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
@@ -2627,9 +2687,9 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
 
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
-                          ARMCC, DAG, dl);
+                          ARMcc, DAG, dl);
   SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
-  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC,
+  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
                            CCR, Cmp);
 
   SDValue Ops[2] = { Lo, Hi };
@@ -2647,7 +2707,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt  = Op.getOperand(2);
-  SDValue ARMCC;
+  SDValue ARMcc;
 
   assert(Op.getOpcode() == ISD::SHL_PARTS);
   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
@@ -2661,9 +2721,9 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
   SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
 
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
-                          ARMCC, DAG, dl);
+                          ARMcc, DAG, dl);
   SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
-  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMCC,
+  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
                            CCR, Cmp);
 
   SDValue Ops[2] = { Lo, Hi };
@@ -2850,13 +2910,11 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
 
 /// isNEONModifiedImm - Check if the specified splat value corresponds to a
 /// valid vector constant for a NEON instruction with a "modified immediate"
-/// operand (e.g., VMOV).  If so, return either the constant being
-/// splatted or the encoded value, depending on the DoEncode parameter.
+/// operand (e.g., VMOV).  If so, return the encoded value.
 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                  unsigned SplatBitSize, SelectionDAG &DAG,
-                                 bool isVMOV, bool DoEncode) {
+                                 EVT &VT, bool is128Bits, bool isVMOV) {
   unsigned OpCmode, Imm;
-  EVT VT;
 
   // SplatBitSize is set to the smallest size that splats the vector, so a
   // zero vector will always have SplatBitSize == 8.  However, NEON modified
@@ -2868,16 +2926,18 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
 
   switch (SplatBitSize) {
   case 8:
+    if (!isVMOV)
+      return SDValue();
    // Any 1-byte value is OK.  Op=0, Cmode=1110.
     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
     OpCmode = 0xe;
     Imm = SplatBits;
-    VT = MVT::i8;
+    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
     break;
 
   case 16:
     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
-    VT = MVT::i16;
+    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
     if ((SplatBits & ~0xff) == 0) {
       // Value = 0x00nn: Op=x, Cmode=100x.
       OpCmode = 0x8;
@@ -2897,7 +2957,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
     //   * only one byte is nonzero, or
     //   * the least significant byte is 0xff and the second byte is nonzero, or
     //   * the least significant 2 bytes are 0xff and the third is nonzero.
-    VT = MVT::i32;
+    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
     if ((SplatBits & ~0xff) == 0) {
       // Value = 0x000000nn: Op=x, Cmode=000x.
       OpCmode = 0;
@@ -2949,9 +3009,9 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
       return SDValue();
 
   case 64: {
-    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
     if (!isVMOV)
       return SDValue();
+    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
     uint64_t BitMask = 0xff;
     uint64_t Val = 0;
     unsigned ImmMask = 1;
@@ -2969,7 +3029,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
     // Op=1, Cmode=1110.
     OpCmode = 0x1e;
     SplatBits = Val;
-    VT = MVT::i64;
+    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
     break;
   }
 
@@ -2978,32 +3038,8 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
     return SDValue();
   }
 
-  if (DoEncode) {
-    unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
-    return DAG.getTargetConstant(EncodedVal, MVT::i32);
-  }
-  return DAG.getTargetConstant(SplatBits, VT);
-}
-
-/// getNEONModImm - If this is a valid vector constant for a NEON instruction
-/// with a "modified immediate" operand (e.g., VMOV) of the specified element
-/// size, return the encoded value for that immediate.  The ByteSize field
-/// indicates the number of bytes of each element [1248].
-SDValue ARM::getNEONModImm(SDNode *N, unsigned ByteSize, bool isVMOV,
-                           SelectionDAG &DAG) {
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
-                                      HasAnyUndefs, ByteSize * 8))
-    return SDValue();
-
-  if (SplatBitSize > ByteSize * 8)
-    return SDValue();
-
-  return isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
-                           SplatBitSize, DAG, isVMOV, true);
+  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
+  return DAG.getTargetConstant(EncodedVal, MVT::i32);
 }
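For intuition, the 32-bit acceptance test that the switch above implements can be sketched in isolation: a splat is encodable when only one byte is nonzero, or when it matches one of the two "nonzero byte over trailing 0xff" forms listed in the comments. This is a hypothetical standalone helper, not the LLVM routine, and the Cmode notes mirror the comments in the code above:

    #include <cstdint>
    #include <cstdio>

    // Sketch of the i32 modified-immediate test: exactly one nonzero byte,
    // or the 0x0000nnff / 0x00nnffff forms.
    static bool isEncodableVMOVi32(uint32_t V) {
      if ((V & ~0x000000ffu) == 0) return true;           // 0x000000nn, Cmode=000x
      if ((V & ~0x0000ff00u) == 0) return true;           // 0x0000nn00, Cmode=001x
      if ((V & ~0x00ff0000u) == 0) return true;           // 0x00nn0000, Cmode=010x
      if ((V & ~0xff000000u) == 0) return true;           // 0xnn000000, Cmode=011x
      if ((V & ~0x0000ff00u) == 0x000000ff) return true;  // 0x0000nnff, Cmode=110x
      if ((V & ~0x00ff0000u) == 0x0000ffff) return true;  // 0x00nnffff, Cmode=110x
      return false;
    }

    int main() {
      printf("%d %d %d\n",
             isEncodableVMOVi32(0x00004200),   // 1: single nonzero byte
             isEncodableVMOVi32(0x000042ff),   // 1: byte over trailing ones
             isEncodableVMOVi32(0x00420042));  // 0: two separated bytes
      return 0;
    }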
 
 static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT,
@@ -3194,43 +3230,6 @@ static bool isVZIP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT,
   return true;
 }
 
-
-static SDValue BuildSplat(SDValue Val, EVT VT, SelectionDAG &DAG, DebugLoc dl) {
-  // Canonicalize all-zeros and all-ones vectors.
-  ConstantSDNode *ConstVal = cast<ConstantSDNode>(Val.getNode());
-  if (ConstVal->isNullValue())
-    return getZeroVector(VT, DAG, dl);
-  if (ConstVal->isAllOnesValue())
-    return getOnesVector(VT, DAG, dl);
-
-  EVT CanonicalVT;
-  if (VT.is64BitVector()) {
-    switch (Val.getValueType().getSizeInBits()) {
-    case 8:  CanonicalVT = MVT::v8i8; break;
-    case 16: CanonicalVT = MVT::v4i16; break;
-    case 32: CanonicalVT = MVT::v2i32; break;
-    case 64: CanonicalVT = MVT::v1i64; break;
-    default: llvm_unreachable("unexpected splat element type"); break;
-    }
-  } else {
-    assert(VT.is128BitVector() && "unknown splat vector size");
-    switch (Val.getValueType().getSizeInBits()) {
-    case 8:  CanonicalVT = MVT::v16i8; break;
-    case 16: CanonicalVT = MVT::v8i16; break;
-    case 32: CanonicalVT = MVT::v4i32; break;
-    case 64: CanonicalVT = MVT::v2i64; break;
-    default: llvm_unreachable("unexpected splat element type"); break;
-    }
-  }
-
-  // Build a canonical splat for this value.
-  SmallVector<SDValue, 8> Ops;
-  Ops.assign(CanonicalVT.getVectorNumElements(), Val);
-  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, &Ops[0],
-                            Ops.size());
-  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Res);
-}
-
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.
 static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
@@ -3244,11 +3243,25 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
     if (SplatBitSize <= 64) {
       // Check if an immediate VMOV works.
+      EVT VmovVT;
       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
-                                      SplatUndef.getZExtValue(),
-                                      SplatBitSize, DAG, true, false);
-      if (Val.getNode())
-        return BuildSplat(Val, VT, DAG, dl);
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VmovVT, VT.is128BitVector(), true);
+      if (Val.getNode()) {
+        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
+        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov);
+      }
+
+      // Try an immediate VMVN.
+      uint64_t NegatedImm = (SplatBits.getZExtValue() ^
+                             ((1LL << SplatBitSize) - 1));
+      Val = isNEONModifiedImm(NegatedImm,
+                              SplatUndef.getZExtValue(), SplatBitSize,
+                              DAG, VmovVT, VT.is128BitVector(), false);
+      if (Val.getNode()) {
+        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
+        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov);
+      }
     }
   }
 
@@ -3825,6 +3838,15 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   return BB;
 }
 
+static
+MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I != Succ)
+      return *I;
+  llvm_unreachable("Expecting a BB with two successors!");
+}
+
 MachineBasicBlock *
 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB) const {
@@ -3941,6 +3963,46 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     return BB;
   }
 
+  case ARM::BCCi64:
+  case ARM::BCCZi64: {
+    // Compare both parts that make up the double comparison separately for
+    // equality.
+    bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
+
+    unsigned LHS1 = MI->getOperand(1).getReg();
+    unsigned LHS2 = MI->getOperand(2).getReg();
+    if (RHSisZero) {
+      AddDefaultPred(BuildMI(BB, dl,
+                             TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                     .addReg(LHS1).addImm(0));
+      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+        .addReg(LHS2).addImm(0)
+        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    } else {
+      unsigned RHS1 = MI->getOperand(3).getReg();
+      unsigned RHS2 = MI->getOperand(4).getReg();
+      AddDefaultPred(BuildMI(BB, dl,
+                             TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+                     .addReg(LHS1).addReg(RHS1));
+      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+        .addReg(LHS2).addReg(RHS2)
+        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    }
+
+    MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB();
+    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
+    if (MI->getOperand(0).getImm() == ARMCC::NE)
+      std::swap(destMBB, exitMBB);
+
+    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+      .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2B : ARM::B))
+      .addMBB(exitMBB);
+
+    MI->eraseFromParent();   // The pseudo instruction is gone now.
+    return BB;
+  }
+
   case ARM::tANDsp:
   case ARM::tADDspr_:
   case ARM::tSUBspi_:
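The BuildMI sequence above leans on predication: the second CMP executes only under EQ, conceptually "cmp lhs1, rhs1; cmpeq lhs2, rhs2", so the flags end up signalling EQ only if both 32-bit halves match. A standalone C++ model of that control flow, with hypothetical names:

    #include <cstdio>
    #include <cstdint>

    // Model of the BCCi64 expansion: the second compare runs only if the
    // first produced EQ, so Z survives only when both halves are equal.
    static bool i64PartsEqual(uint32_t LHS1, uint32_t LHS2,
                              uint32_t RHS1, uint32_t RHS2) {
      bool Z = (LHS1 == RHS1);   // first CMP
      if (Z)
        Z = (LHS2 == RHS2);      // second CMP, predicated on EQ
      return Z;
    }

    // For ARMCC::NE the inserter swaps destMBB and exitMBB rather than
    // inverting the compare sequence.
    static const char *takenBlock(bool IsNE, bool Equal) {
      return (Equal != IsNE) ? "destMBB" : "exitMBB";
    }

    int main() {
      printf("%s\n", takenBlock(false, i64PartsEqual(1, 2, 1, 2))); // destMBB
      printf("%s\n", takenBlock(true,  i64PartsEqual(1, 2, 1, 3))); // destMBB
      return 0;
    }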
@@ -4180,6 +4242,35 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
   return SDValue();
 }
 
+/// PerformVDUPLANECombine - Target-specific dag combine xforms for
+/// ARMISD::VDUPLANE.
+static SDValue PerformVDUPLANECombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
+  // redundant.
+  SDValue Op = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // Ignore bit_converts.
+  while (Op.getOpcode() == ISD::BIT_CONVERT)
+    Op = Op.getOperand(0);
+  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
+    return SDValue();
+
+  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
+  unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits();
+  // The canonical VMOV for a zero vector uses a 32-bit element size.
+  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned EltBits;
+  if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
+    EltSize = 8;
+  if (EltSize > VT.getVectorElementType().getSizeInBits())
+    return SDValue();
+
+  SDValue Res = DCI.DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
+  return DCI.CombineTo(N, Res, false);
+}
+
 /// getVShiftImm - Check if this is a valid build_vector for the immediate
 /// operand of a vector shift operation, where all the elements of the
 /// build_vector must have the same constant integer value.
@@ -4558,6 +4649,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SUB:        return PerformSUBCombine(N, DCI);
   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
+  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
   case ISD::SHL:
   case ISD::SRA:
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 3a38669..128b72e 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -53,6 +53,8 @@ namespace llvm {
       CMOV,         // ARM conditional move instructions.
       CNEG,         // ARM conditional negate instructions.
 
+      BCC_i64,
+
       RBIT,         // ARM bitreverse instruction
 
       FTOSI,        // FP to sint within a FP register.
@@ -122,6 +124,10 @@ namespace llvm {
       VGETLANEu,    // zero-extend vector extract element
      VGETLANEs,    // sign-extend vector extract element
 
+      // Vector move immediate and move negated immediate:
+      VMOVIMM,
+      VMVNIMM,
+
       // Vector duplicate:
       VDUP,
       VDUPLANE,
@@ -150,13 +156,6 @@ namespace llvm {
 
   /// Define some predicates that are used for node matching.
   namespace ARM {
-    /// getNEONModImm - If this is a valid vector constant for a NEON
-    /// instruction with a "modified immediate" operand (e.g., VMOV) of the
-    /// specified element size, return the encoded value for that immediate.
-    /// The ByteSize field indicates the number of bytes of each element [1248].
-    SDValue getNEONModImm(SDNode *N, unsigned ByteSize, bool isVMOV,
-                          SelectionDAG &DAG);
-
     /// getVFPf32Imm / getVFPf64Imm - If the given fp immediate can be
     /// materialized with a VMOV.f32 / VMOV.f64 (i.e. fconsts / fconstd)
     /// instruction, returns its 8-bit integer representation.  Otherwise,
@@ -363,9 +362,11 @@ namespace llvm {
                             DebugLoc dl, SelectionDAG &DAG) const;
 
     SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
-                      SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl) const;
-    SDValue getVFPCmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC,
-                      SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl) const;
+                      SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const;
+    SDValue getVFPCmp(SDValue LHS, SDValue RHS,
+                      SelectionDAG &DAG, DebugLoc dl) const;
+
+    SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
 
     MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI,
                                          MachineBasicBlock *BB,
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index c73e204..51fc152 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -38,6 +38,12 @@ def SDT_ARMBr2JT : SDTypeProfile<0, 4,
                                  [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
                                   SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
 
+def SDT_ARMBCC_i64 : SDTypeProfile<0, 6,
+                                   [SDTCisVT<0, i32>,
+                                    SDTCisVT<1, i32>, SDTCisVT<2, i32>,
+                                    SDTCisVT<3, i32>, SDTCisVT<4, i32>,
+                                    SDTCisVT<5, OtherVT>]>;
+
 def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
 
 def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
@@ -90,6 +96,9 @@ def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
 def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT,
                       [SDNPHasChain]>;
 
+def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64,
+                       [SDNPHasChain]>;
+
 def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp,
                     [SDNPOutFlag]>;
 
@@ -1685,13 +1694,19 @@ def RSCSrs : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
 }
 
 // (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+// The assume-no-carry-in form uses the negation of the input since add/sub
+// assume opposite meanings of the carry flag (i.e., carry == !borrow).
+// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
+// details.
 def : ARMPat<(add GPR:$src, so_imm_neg:$imm),
             (SUBri GPR:$src, so_imm_neg:$imm)>;
-
-//def : ARMPat<(addc GPR:$src, so_imm_neg:$imm),
-//             (SUBSri GPR:$src, so_imm_neg:$imm)>;
-//def : ARMPat<(adde GPR:$src, so_imm_neg:$imm),
-//             (SBCri GPR:$src, so_imm_neg:$imm)>;
+def : ARMPat<(addc GPR:$src, so_imm_neg:$imm),
+             (SUBSri GPR:$src, so_imm_neg:$imm)>;
+// The with-carry-in form matches bitwise not instead of the negation.
+// Effectively, the inverse interpretation of the carry flag already accounts
+// for part of the negation.
+def : ARMPat<(adde GPR:$src, so_imm_not:$imm),
+             (SBCri GPR:$src, so_imm_not:$imm)>;
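The carry convention these patterns rely on comes from the ARM ARM's AddWithCarry() pseudocode: a subtraction x - y executes as x + ~y + 1, so the carry-out is the inverse of a borrow, and an adde of the bitwise-not immediate lines up with SBC. A rough C++ transcription for intuition; this is a sketch of the pseudocode, not the architected definition verbatim:

    #include <cassert>
    #include <cstdint>

    // Sketch of AddWithCarry() from the ARM ARM (A2.2.1): 32-bit result
    // plus carry-out.
    static uint32_t AddWithCarry(uint32_t X, uint32_t Y, bool CarryIn,
                                 bool &CarryOut) {
      uint64_t Sum = uint64_t(X) + uint64_t(Y) + (CarryIn ? 1 : 0);
      CarryOut = (Sum >> 32) != 0;
      return uint32_t(Sum);
    }

    int main() {
      bool C;
      // SUBS x, y is AddWithCarry(x, NOT(y), 1): carry-out set iff no borrow.
      assert(AddWithCarry(5, ~uint32_t(3), true, C) == 2 && C);              // 5 - 3
      assert(AddWithCarry(3, ~uint32_t(5), true, C) == uint32_t(-2) && !C);  // 3 - 5
      // adde(x, ~imm): the +1 of the negation arrives through the carry
      // flag, which is why the pattern matches bitwise not, not negation.
      assert(AddWithCarry(7, ~uint32_t(2), true, C) == 5 && C);              // SBC form
      return 0;
    }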
 
 // Note: These are implemented in C++ code, because they have to generate
 // ADD/SUBrs instructions, which use a complex pattern that a xform function
@@ -2279,6 +2294,22 @@ defm CMNz  : AI1_cmp_irs<0b1011, "cmn",
 def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm),
              (CMNzri  GPR:$src, so_imm_neg:$imm)>;
 
+// Pseudo i64 compares for some floating point compares.
+let usesCustomInserter = 1, isBranch = 1, isTerminator = 1,
+    Defs = [CPSR] in {
+def BCCi64 : PseudoInst<(outs),
+     (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst),
+      IIC_Br,
+     "${:comment} B\t$dst GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, imm:$cc",
+    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>;
+
+def BCCZi64 : PseudoInst<(outs),
+     (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst),
+      IIC_Br,
+     "${:comment} B\t$dst GPR:$lhs1, GPR:$lhs2, 0, 0, imm:$cc",
+    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>;
+} // usesCustomInserter
+
 // Conditional moves
 // FIXME: should be able to write a pattern for ARMcmov, but can't use
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index a84315f..7f7eb98 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -65,6 +65,10 @@ def SDTARMVGETLN  : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
 def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
 def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
 
+def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
+def NEONvmovImm   : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
+def NEONvmvnImm   : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
+
 def NEONvdup      : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
 
 // VDUPLANE can produce a quad-register result from a double-register source,
@@ -94,6 +98,20 @@ def SDTARMFMAX    : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>,
 def NEONfmax      : SDNode<"ARMISD::FMAX", SDTARMFMAX>;
 def NEONfmin      : SDNode<"ARMISD::FMIN", SDTARMFMAX>;
 
+def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{
+  ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
+  unsigned EltBits;
+  uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+  return (EltBits == 32 && EltVal == 0);
+}]>;
+
+def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{
+  ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
+  unsigned EltBits;
+  uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+  return (EltBits == 8 && EltVal == 0xff);
+}]>;
+
 //===----------------------------------------------------------------------===//
 // NEON operand definitions
 //===----------------------------------------------------------------------===//
@@ -2318,10 +2336,10 @@ defm VTST     : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
 
 // Vector Bitwise Operations.
 
-def vnot8 : PatFrag<(ops node:$in),
-                    (xor node:$in, (bitconvert (v8i8 immAllOnesV)))>;
-def vnot16 : PatFrag<(ops node:$in),
-                     (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>;
+def vnotd : PatFrag<(ops node:$in),
+                    (xor node:$in, (bitconvert (v8i8 NEONimmAllOnesV)))>;
+def vnotq : PatFrag<(ops node:$in),
+                    (xor node:$in, (bitconvert (v16i8 NEONimmAllOnesV)))>;
 
 // VAND : Vector Bitwise AND
@@ -2347,36 +2365,58 @@ def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
                  (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD,
                  "vbic", "$dst, $src1, $src2", "",
                  [(set DPR:$dst, (v2i32 (and DPR:$src1,
-                                             (vnot8 DPR:$src2))))]>;
+                                             (vnotd DPR:$src2))))]>;
 def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
                  (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ,
                  "vbic", "$dst, $src1, $src2", "",
                  [(set QPR:$dst, (v4i32 (and QPR:$src1,
-                                             (vnot16 QPR:$src2))))]>;
+                                             (vnotq QPR:$src2))))]>;
 
 // VORN : Vector Bitwise OR NOT
 def VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst),
                  (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD,
                  "vorn", "$dst, $src1, $src2", "",
                  [(set DPR:$dst, (v2i32 (or DPR:$src1,
-                                            (vnot8 DPR:$src2))))]>;
+                                            (vnotd DPR:$src2))))]>;
 def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst),
                  (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ,
                  "vorn", "$dst, $src1, $src2", "",
                  [(set QPR:$dst, (v4i32 (or QPR:$src1,
-                                            (vnot16 QPR:$src2))))]>;
+                                            (vnotq QPR:$src2))))]>;
+
+// VMVN : Vector Bitwise NOT (Immediate)
+
+let isReMaterializable = 1 in {
+def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$dst),
+                         (ins nModImm:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i16", "$dst, $SIMM", "",
+                         [(set DPR:$dst, (v4i16 (NEONvmvnImm timm:$SIMM)))]>;
+def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$dst),
+                         (ins nModImm:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i16", "$dst, $SIMM", "",
+                         [(set QPR:$dst, (v8i16 (NEONvmvnImm timm:$SIMM)))]>;
+
+def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$dst),
+                         (ins nModImm:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i32", "$dst, $SIMM", "",
+                         [(set DPR:$dst, (v2i32 (NEONvmvnImm timm:$SIMM)))]>;
+def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$dst),
+                         (ins nModImm:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i32", "$dst, $SIMM", "",
+                         [(set QPR:$dst, (v4i32 (NEONvmvnImm timm:$SIMM)))]>;
+}
 
 // VMVN : Vector Bitwise NOT
 def VMVNd : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
                  (outs DPR:$dst), (ins DPR:$src), IIC_VSUBiD,
                  "vmvn", "$dst, $src", "",
-                 [(set DPR:$dst, (v2i32 (vnot8 DPR:$src)))]>;
+                 [(set DPR:$dst, (v2i32 (vnotd DPR:$src)))]>;
 def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
                  (outs QPR:$dst), (ins QPR:$src), IIC_VSUBiD,
                  "vmvn", "$dst, $src", "",
-                 [(set QPR:$dst, (v4i32 (vnot16 QPR:$src)))]>;
-def : Pat<(v2i32 (vnot8 DPR:$src)), (VMVNd DPR:$src)>;
-def : Pat<(v4i32 (vnot16 QPR:$src)), (VMVNq QPR:$src)>;
+                 [(set QPR:$dst, (v4i32 (vnotq QPR:$src)))]>;
+def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>;
+def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>;
 
 // VBSL : Vector Bitwise Select
 def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
@@ -2385,14 +2425,14 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
                  "vbsl", "$dst, $src2, $src3", "$src1 = $dst",
                  [(set DPR:$dst,
                        (v2i32 (or (and DPR:$src2, DPR:$src1),
-                                  (and DPR:$src3, (vnot8 DPR:$src1)))))]>;
+                                  (and DPR:$src3, (vnotd DPR:$src1)))))]>;
 def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
                  (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, IIC_VCNTiQ,
                  "vbsl", "$dst, $src2, $src3", "$src1 = $dst",
                  [(set QPR:$dst,
                        (v4i32 (or (and QPR:$src2, QPR:$src1),
-                                  (and QPR:$src3, (vnot16 QPR:$src1)))))]>;
+                                  (and QPR:$src3, (vnotq QPR:$src1)))))]>;
 
 // VBIF : Vector Bitwise Insert if False
 //   like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
@@ -2726,20 +2766,19 @@ defm VQABS    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
 
 // Vector Negate.
 
-def vneg   : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>;
-def vneg8  : PatFrag<(ops node:$in),
-                     (sub (bitconvert (v8i8 immAllZerosV)), node:$in)>;
-def vneg16 : PatFrag<(ops node:$in),
-                     (sub (bitconvert (v16i8 immAllZerosV)), node:$in)>;
+def vnegd  : PatFrag<(ops node:$in),
+                     (sub (bitconvert (v2i32 NEONimmAllZerosV)), node:$in)>;
+def vnegq  : PatFrag<(ops node:$in),
+                     (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>;
 
 class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
   : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src),
         IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "",
-        [(set DPR:$dst, (Ty (vneg8 DPR:$src)))]>;
+        [(set DPR:$dst, (Ty (vnegd DPR:$src)))]>;
 class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
   : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src),
         IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "",
-        [(set QPR:$dst, (Ty (vneg16 QPR:$src)))]>;
+        [(set QPR:$dst, (Ty (vnegq QPR:$src)))]>;
 
 // VNEG : Vector Negate (integer)
 def  VNEGs8d  : VNEGD<0b00, "vneg", "s8", v8i8>;
@@ -2759,12 +2798,12 @@ def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
                    "vneg", "f32", "$dst, $src", "",
                    [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>;
 
-def : Pat<(v8i8  (vneg8 DPR:$src)),  (VNEGs8d DPR:$src)>;
-def : Pat<(v4i16 (vneg8 DPR:$src)),  (VNEGs16d DPR:$src)>;
-def : Pat<(v2i32 (vneg8 DPR:$src)),  (VNEGs32d DPR:$src)>;
-def : Pat<(v16i8 (vneg16 QPR:$src)), (VNEGs8q QPR:$src)>;
-def : Pat<(v8i16 (vneg16 QPR:$src)), (VNEGs16q QPR:$src)>;
-def : Pat<(v4i32 (vneg16 QPR:$src)), (VNEGs32q QPR:$src)>;
+def : Pat<(v8i8  (vnegd DPR:$src)),  (VNEGs8d DPR:$src)>;
+def : Pat<(v4i16 (vnegd DPR:$src)),  (VNEGs16d DPR:$src)>;
+def : Pat<(v2i32 (vnegd DPR:$src)),  (VNEGs32d DPR:$src)>;
+def : Pat<(v16i8 (vnegq QPR:$src)),  (VNEGs8q QPR:$src)>;
+def : Pat<(v8i16 (vnegq QPR:$src)),  (VNEGs16q QPR:$src)>;
+def : Pat<(v4i32 (vnegq QPR:$src)),  (VNEGs32q QPR:$src)>;
 
 // VQNEG : Vector Saturating Negate
 defm VQNEG    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
@@ -2818,74 +2857,42 @@ def VMOVQQQQ : PseudoInst<(outs QQQQPR:$dst), (ins QQQQPR:$src),
 
 //   VMOV     : Vector Move (Immediate)
 
-// VMOV_get_imm8 xform function: convert build_vector to VMOV.i8 imm.
-def VMOV_get_imm8 : SDNodeXForm<build_vector, [{
-  return ARM::getNEONModImm(N, 1, true, *CurDAG);
-}]>;
-def vmovImm8 : PatLeaf<(build_vector), [{
-  return ARM::getNEONModImm(N, 1, true, *CurDAG).getNode() != 0;
-}], VMOV_get_imm8>;
-
-// VMOV_get_imm16 xform function: convert build_vector to VMOV.i16 imm.
-def VMOV_get_imm16 : SDNodeXForm<build_vector, [{
-  return ARM::getNEONModImm(N, 2, true, *CurDAG);
-}]>;
-def vmovImm16 : PatLeaf<(build_vector), [{
-  return ARM::getNEONModImm(N, 2, true, *CurDAG).getNode() != 0;
-}], VMOV_get_imm16>;
-
-// VMOV_get_imm32 xform function: convert build_vector to VMOV.i32 imm.
-def VMOV_get_imm32 : SDNodeXForm<build_vector, [{
-  return ARM::getNEONModImm(N, 4, true, *CurDAG);
-}]>;
-def vmovImm32 : PatLeaf<(build_vector), [{
-  return ARM::getNEONModImm(N, 4, true, *CurDAG).getNode() != 0;
-}], VMOV_get_imm32>;
-
-// VMOV_get_imm64 xform function: convert build_vector to VMOV.i64 imm.
-def VMOV_get_imm64 : SDNodeXForm<build_vector, [{
-  return ARM::getNEONModImm(N, 8, true, *CurDAG);
-}]>;
-def vmovImm64 : PatLeaf<(build_vector), [{
-  return ARM::getNEONModImm(N, 8, true, *CurDAG).getNode() != 0;
-}], VMOV_get_imm64>;
-
 let isReMaterializable = 1 in {
 def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst),
                         (ins nModImm:$SIMM), IIC_VMOVImm,
                         "vmov", "i8", "$dst, $SIMM", "",
-                        [(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>;
+                        [(set DPR:$dst, (v8i8 (NEONvmovImm timm:$SIMM)))]>;
 def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
                          "vmov", "i8", "$dst, $SIMM", "",
-                         [(set QPR:$dst, (v16i8 vmovImm8:$SIMM))]>;
+                         [(set QPR:$dst, (v16i8 (NEONvmovImm timm:$SIMM)))]>;
 
 def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$dst),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
                          "vmov", "i16", "$dst, $SIMM", "",
-                         [(set DPR:$dst, (v4i16 vmovImm16:$SIMM))]>;
+                         [(set DPR:$dst, (v4i16 (NEONvmovImm timm:$SIMM)))]>;
 def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$dst),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
                          "vmov", "i16", "$dst, $SIMM", "",
-                         [(set QPR:$dst, (v8i16 vmovImm16:$SIMM))]>;
+                         [(set QPR:$dst, (v8i16 (NEONvmovImm timm:$SIMM)))]>;
 
-def VMOVv2i32 : N1ModImm<1, 0b000, {0,?,?,0}, 0, 0, 0, 1, (outs DPR:$dst),
+def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$dst),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
                          "vmov", "i32", "$dst, $SIMM", "",
-                         [(set DPR:$dst, (v2i32 vmovImm32:$SIMM))]>;
-def VMOVv4i32 : N1ModImm<1, 0b000, {0,?,?,0}, 0, 1, 0, 1, (outs QPR:$dst),
+                         [(set DPR:$dst, (v2i32 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$dst),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
                          "vmov", "i32", "$dst, $SIMM", "",
-                         [(set QPR:$dst, (v4i32 vmovImm32:$SIMM))]>;
+                         [(set QPR:$dst, (v4i32 (NEONvmovImm timm:$SIMM)))]>;
 
 def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
                          "vmov", "i64", "$dst, $SIMM", "",
-                         [(set DPR:$dst, (v1i64 vmovImm64:$SIMM))]>;
+                         [(set DPR:$dst, (v1i64 (NEONvmovImm timm:$SIMM)))]>;
 def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
                          "vmov", "i64", "$dst, $SIMM", "",
-                         [(set QPR:$dst, (v2i64 vmovImm64:$SIMM))]>;
+                         [(set QPR:$dst, (v2i64 (NEONvmovImm timm:$SIMM)))]>;
 } // isReMaterializable
 
 //   VMOV     : Vector Get Lane (move scalar to ARM core register)
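These NEONvmovImm patterns pair with the VMVN forms added above and with the LowerBUILD_VECTOR change earlier in this commit: a splat that is not directly encodable may become encodable after complementing it within the element width, in which case a single VMVN materializes it. A small standalone illustration, using a hypothetical helper and element widths below 64 bits:

    #include <cstdint>
    #include <cstdio>

    // Mirrors the NegatedImm computation in LowerBUILD_VECTOR: complement
    // the splat within its element width and test that value instead.
    static uint64_t complementInWidth(uint64_t SplatBits,
                                      unsigned SplatBitSize) {
      return SplatBits ^ ((1ULL << SplatBitSize) - 1);
    }

    int main() {
      // 0xffffff00 has three nonzero bytes and is not a VMOV.i32 immediate,
      // but its 32-bit complement 0x000000ff is, so something like
      // "vmvn.i32 d0, #0xff" can build it in one instruction.
      printf("0x%08llx\n",
             (unsigned long long)complementInWidth(0xffffff00ULL, 32));
      return 0;
    }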
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 4692f2a..bbe675e 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -122,6 +122,10 @@ def imm0_255_neg : PatLeaf<(i32 imm), [{
   return (uint32_t)(-N->getZExtValue()) < 255;
 }], imm_neg_XFORM>;
 
+def imm0_255_not : PatLeaf<(i32 imm), [{
+  return (uint32_t)(~N->getZExtValue()) < 255;
+}], imm_comp_XFORM>;
+
 // Define Thumb2 specific addressing modes.
 
 // t2addrmode_imm12  := reg + imm12
@@ -1391,13 +1395,32 @@ defm t2RSBS : T2I_rbin_s_is <0b1110, "rsb",
                              BinOpFrag<(subc node:$LHS, node:$RHS)>>;
 
 // (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+// The assume-no-carry-in form uses the negation of the input since add/sub
+// assume opposite meanings of the carry flag (i.e., carry == !borrow).
+// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
+// details.
+// The AddedComplexity prefers the first variant over the others since
+// it can be shrunk to a 16-bit wide encoding, while the others cannot.
+let AddedComplexity = 1 in
+def : T2Pat<(add       GPR:$src, imm0_255_neg:$imm),
+            (t2SUBri   GPR:$src, imm0_255_neg:$imm)>;
+def : T2Pat<(add       GPR:$src, t2_so_imm_neg:$imm),
+            (t2SUBri   GPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(add       GPR:$src, imm0_4095_neg:$imm),
+            (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>;
+let AddedComplexity = 1 in
+def : T2Pat<(addc      GPR:$src, imm0_255_neg:$imm),
+            (t2SUBSri  GPR:$src, imm0_255_neg:$imm)>;
+def : T2Pat<(addc      GPR:$src, t2_so_imm_neg:$imm),
+            (t2SUBSri  GPR:$src, t2_so_imm_neg:$imm)>;
+// The with-carry-in form matches bitwise not instead of the negation.
+// Effectively, the inverse interpretation of the carry flag already accounts
+// for part of the negation.
 let AddedComplexity = 1 in
-def : T2Pat<(add       GPR:$src, imm0_255_neg:$imm),
-            (t2SUBri   GPR:$src, imm0_255_neg:$imm)>;
-def : T2Pat<(add       GPR:$src, t2_so_imm_neg:$imm),
-            (t2SUBri   GPR:$src, t2_so_imm_neg:$imm)>;
-def : T2Pat<(add       GPR:$src, imm0_4095_neg:$imm),
-            (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>;
+def : T2Pat<(adde      GPR:$src, imm0_255_not:$imm),
+            (t2SBCSri  GPR:$src, imm0_255_not:$imm)>;
+def : T2Pat<(adde      GPR:$src, t2_so_imm_not:$imm),
+            (t2SBCSri  GPR:$src, t2_so_imm_not:$imm)>;
 
 // Select Bytes -- for disassembly only
@@ -2435,7 +2458,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
     hasExtraDefRegAllocReq = 1 in
 def t2LDM_RET : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                        reglist:$dsts, variable_ops), IIC_Br,
-                      "ldm${addr:submode}${p}${addr:wide}\t$addr, $dsts",
+                      "ldm${addr:submode}${p}${addr:wide}\t$addr!, $dsts",
                       "$addr.addr = $wb", []> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b00;
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 8332bba..e7d92ed 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -54,6 +54,9 @@ protected:
   /// the VML[AS] instructions are slow (if so, don't use them).
   bool SlowVMLx;
 
+  /// SlowFPBrcc - True if floating point compare + branch is slow.
+  bool SlowFPBrcc;
+
   /// IsThumb - True if we are in thumb mode, false if in ARM mode.
   bool IsThumb;
 
@@ -133,6 +136,7 @@ protected:
   bool hasDivide() const { return HasHardwareDivide; }
   bool hasT2ExtractPack() const { return HasT2ExtractPack; }
   bool useVMLx() const { return hasVFP2() && !SlowVMLx; }
+  bool isFPBrccSlow() const { return SlowFPBrcc; }
 
   bool hasFP16() const { return HasFP16; }
 
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 8415d1a..4b08324 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -88,7 +88,7 @@ private:
   /// its register number, or -1 if there is no match.  To allow return values
   /// to be used directly in register lists, arm registers have values between
   /// 0 and 15.
-  int MatchRegisterName(const StringRef &Name);
+  int MatchRegisterName(StringRef Name);
 
   /// }
 
@@ -97,7 +97,7 @@ public:
   ARMAsmParser(const Target &T, MCAsmParser &_Parser)
     : TargetAsmParser(T), Parser(_Parser) {}
 
-  virtual bool ParseInstruction(const StringRef &Name, SMLoc NameLoc,
+  virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc,
                                 SmallVectorImpl<MCParsedAsmOperand*> &Operands);
 
   virtual bool ParseDirective(AsmToken DirectiveID);
@@ -517,7 +517,7 @@ bool ARMAsmParser::ParseShift(ShiftType &St,
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier))
     return true;
-  const StringRef &ShiftName = Tok.getString();
+  StringRef ShiftName = Tok.getString();
   if (ShiftName == "lsl" || ShiftName == "LSL")
     St = Lsl;
   else if (ShiftName == "lsr" || ShiftName == "LSR")
@@ -549,7 +549,7 @@ bool ARMAsmParser::ParseShift(ShiftType &St,
 }
 
 /// A hack to allow some testing, to be replaced by a real table gen version.
-int ARMAsmParser::MatchRegisterName(const StringRef &Name) {
+int ARMAsmParser::MatchRegisterName(StringRef Name) {
   if (Name == "r0" || Name == "R0")
     return 0;
   else if (Name == "r1" || Name == "R1")
@@ -593,7 +593,7 @@ MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                  MCInst &Inst) {
   ARMOperand &Op0 = *(ARMOperand*)Operands[0];
   assert(Op0.Kind == ARMOperand::Token && "First operand not a Token");
-  const StringRef &Mnemonic = Op0.getToken();
+  StringRef Mnemonic = Op0.getToken();
   if (Mnemonic == "add" ||
       Mnemonic == "stmfd" ||
       Mnemonic == "str" ||
@@ -658,7 +658,7 @@ bool ARMAsmParser::ParseOperand(OwningPtr<ARMOperand> &Op) {
 }
 
 /// Parse an arm instruction mnemonic followed by its operands.
-bool ARMAsmParser::ParseInstruction(const StringRef &Name, SMLoc NameLoc,
+bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   OwningPtr<ARMOperand> Op;
   ARMOperand::CreateToken(Op, Name, NameLoc);
@@ -761,7 +761,7 @@ bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) {
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier))
     return Error(L, "unexpected token in .syntax directive");
-  const StringRef &Mode = Tok.getString();
+  StringRef Mode = Tok.getString();
   if (Mode == "unified" || Mode == "UNIFIED")
     Parser.Lex();
   else if (Mode == "divided" || Mode == "DIVIDED")
diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
index 6a40cf3..946f474 100644
--- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
@@ -602,12 +602,8 @@ void ARMAsmPrinter::printAddrMode6Operand(const MachineInstr *MI, int Op,
 
   O << "[" << getRegisterName(MO1.getReg());
   if (MO2.getImm()) {
-    unsigned Align = MO2.getImm();
-    assert((Align == 8 || Align == 16 || Align == 32) &&
-           "unexpected NEON load/store alignment");
-    Align <<= 3;
     // FIXME: Both darwin as and GNU as violate ARM docs here.
-    O << ", :" << Align;
+    O << ", :" << (MO2.getImm() << 3);
   }
   O << "]";
 }
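Both printer fixes encode the same unit change: after the ARMAddressingModes.h edit at the top of this diff, the addrmode6 machine operand carries the alignment in bytes, while the "[rN, :n]" assembly syntax wants bits, hence the shift by 3. A tiny standalone sketch with a hypothetical function name:

    #include <cstdio>

    // The alignment operand is in bytes; the ":<n>" suffix is in bits.
    static void printAddrMode6Align(unsigned Reg, unsigned AlignBytes) {
      if (AlignBytes)
        printf("[r%u, :%u]\n", Reg, AlignBytes << 3);
      else
        printf("[r%u]\n", Reg);
    }

    int main() {
      printAddrMode6Align(0, 8);   // prints "[r0, :64]"
      printAddrMode6Align(1, 16);  // prints "[r1, :128]"
      printAddrMode6Align(2, 0);   // prints "[r2]"
      return 0;
    }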
- O << ", :" << MO2.getImm(); + O << ", :" << (MO2.getImm() << 3); } O << "]"; } diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 85d5ca0..0cb8ff0 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -590,3 +590,70 @@ than the Z bit, we'll need additional logic to reverse the conditionals associated with the comparison. Perhaps a pseudo-instruction for the comparison, with a post-codegen pass to clean up and handle the condition codes? See PR5694 for testcase. + +//===---------------------------------------------------------------------===// + +Given the following on armv5: +int test1(int A, int B) { + return (A&-8388481)|(B&8388480); +} + +We currently generate: + ldr r2, .LCPI0_0 + and r0, r0, r2 + ldr r2, .LCPI0_1 + and r1, r1, r2 + orr r0, r1, r0 + bx lr + +We should be able to replace the second ldr+and with a bic (i.e. reuse the +constant which was already loaded). Not sure what's necessary to do that. + +//===---------------------------------------------------------------------===// + +Given the following on ARMv7: +int test1(int A, int B) { + return (A&-8388481)|(B&8388480); +} + +We currently generate: + bfc r0, #7, #16 + movw r2, #:lower16:8388480 + movt r2, #:upper16:8388480 + and r1, r1, r2 + orr r0, r1, r0 + bx lr + +The following is much shorter: + lsr r1, r1, #7 + bfi r0, r1, #7, #16 + bx lr + + +//===---------------------------------------------------------------------===// + +The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal: + +int a(int x) { return __builtin_bswap32(x); } + +a: + mov r1, #255, 24 + mov r2, #255, 16 + and r1, r1, r0, lsr #8 + and r2, r2, r0, lsl #8 + orr r1, r1, r0, lsr #24 + orr r0, r2, r0, lsl #24 + orr r0, r0, r1 + bx lr + +Something like the following would be better (fewer instructions/registers): + eor r1, r0, r0, ror #16 + bic r1, r1, #0xff0000 + mov r1, r1, lsr #8 + eor r0, r1, r0, ror #8 + bx lr + +A custom Thumb version would also be a slight improvement over the generic +version. + +//===---------------------------------------------------------------------===// |