index 90eea22f77bd5cec37513bb48167e37ec7dc4124..b0c6205ac69139de1aceab10d0d9e449027e5050 100644 (file)
setOperationAction(ISD::FLOG10, MVT::f80, Expand);
setOperationAction(ISD::FEXP, MVT::f80, Expand);
setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+ setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
setOperationAction(ISD::UMULO, VT, Custom);
}
- // There are no 8-bit 3-address imul/mul instructions
- setOperationAction(ISD::SMULO, MVT::i8, Expand);
- setOperationAction(ISD::UMULO, MVT::i8, Expand);
if (!Subtarget->is64Bit()) {
// These libcalls are not available in 32-bit.
"a sorted mask where the broadcast "
"comes from V1.");
- // Check if this is a broadcast of a scalar. We special case lowering for
- // scalars so that we can more effectively fold with loads.
+ // Go up the chain of (vector) values to try and find a scalar load that
+ // we can combine with the broadcast.
+ for (;;) {
+ switch (V.getOpcode()) {
+ case ISD::CONCAT_VECTORS: {
+ int OperandSize = Mask.size() / V.getNumOperands();
+ V = V.getOperand(BroadcastIdx / OperandSize);
+ BroadcastIdx %= OperandSize;
+ continue;
+ }
+
+ case ISD::INSERT_SUBVECTOR: {
+ SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+ auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
+ if (!ConstantIdx)
+ break;
+
+ int BeginIdx = (int)ConstantIdx->getZExtValue();
+ int EndIdx =
+ BeginIdx + (int)VInner.getValueType().getVectorNumElements();
+ if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
+ BroadcastIdx -= BeginIdx;
+ V = VInner;
+ } else {
+ V = VOuter;
+ }
+ continue;
+ }
+ }
+ break;
+ }
+
+ // Check if this is a broadcast of a scalar. We special case lowering
+ // for scalars so that we can more effectively fold with loads.
if (V.getOpcode() == ISD::BUILD_VECTOR ||
- (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+ (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
// If the scalar isn't a load we can't broadcast from it in AVX1, only with
// Try to use rotation instructions if available.
if (Subtarget->hasSSSE3())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i16, V1, V2, Mask, DAG))
return Rotate;
if (NumV1Inputs + NumV2Inputs <= 4)
// Try to use rotation instructions if available.
if (Subtarget->hasSSSE3())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v16i8, V1, V2,
- OrigMask, DAG))
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i8, V1, V2, OrigMask, DAG))
return Rotate;
// Try to use a zext lowering.
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- assert(Subtarget->hasDQI() && "We can only lower v8i64 with AVX-512-DQI");
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- assert(Subtarget->hasDQI() && "We can only lower v16i32 with AVX-512-DQI!");
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
@@ -10267,6 +10297,11 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Subtarget->hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// Dispatch to each element type for lowering. If we don't have supprot for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
@@ -10277,13 +10312,9 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
case MVT::v16f32:
return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v8i64:
- if (Subtarget->hasDQI())
- return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
- break;
+ return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v16i32:
- if (Subtarget->hasDQI())
- return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
- break;
+ return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v32i16:
if (Subtarget->hasBWI())
return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const {
+ // FIXME: We should use instruction latency models to calculate the cost of
+ // each potential sequence, but this is very hard to do reliably because
+ // at least Intel's Core* chips have variable timing based on the number of
+ // significant digits in the divisor and/or sqrt operand.
+ if (!Subtarget->useSqrtEst())
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+
+ // SSE1 has rsqrtss and rsqrtps.
+ // TODO: Add support for AVX512 (v16f32).
+ // It is likely not profitable to do this for f64 because a double-precision
+ // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
+ // instructions: convert to single, rsqrtss, convert back to double, refine
+ // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
+ // along with FMA, this could be a throughput win.
+ if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+ (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+ RefinementSteps = 1;
+ UseOneConstNR = false;
+ return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+ }
+ return SDValue();
+}
+
static bool isAllOnes(SDValue V) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
return C && C->isAllOnesValue();
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
-/// \brief Return (and \p Op, \p Mask) for compare instructions or
-/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
+/// \brief Return (vselect \p Mask, \p Op, \p PreservedSrc) along with the
/// necessary casting for \p Mask when lowering masking intrinsics.
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc, SelectionDAG &DAG) {
SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
DAG.getIntPtrConstant(0));
-
- switch (Op.getOpcode()) {
- default: break;
- case X86ISD::PCMPEQM:
- case X86ISD::PCMPGTM:
- case X86ISD::CMPM:
- case X86ISD::CMPMU:
- return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
- }
-
return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
}
// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
// (i8 (bitcast
// (v8i1 (insert_subvector undef,
- // (v2i1 (and (PCMPEQM %a, %b),
- // (extract_subvector
- // (v8i1 (bitcast %mask)), 0))), 0))))
+ // (v2i1 (vselect (extract_subvector
+ // (v8i1 (bitcast %mask)), 0),
+ // (PCMPEQM %a, %b), 0))))))
EVT VT = Op.getOperand(1).getValueType();
EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
VT.getVectorNumElements());
Cond = X86::COND_B;
break;
case ISD::SMULO:
- BaseOp = X86ISD::SMUL;
+ BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
Cond = X86::COND_O;
break;
case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
+ if (N->getValueType(0) == MVT::i8) {
+ BaseOp = X86ISD::UMUL8;
+ Cond = X86::COND_O;
+ break;
+ }
SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
MVT::i32);
SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
/// specific shuffle of a load can be folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
-/// shuffles have been customed lowered so we need to handle those here.
+/// shuffles have been custom lowered so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
if (DCI.isBeforeLegalizeOps())
@@ -21720,18 +21776,20 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
if (!isa<ConstantSDNode>(EltNo))
return SDValue();
- EVT VT = InVec.getValueType();
+ EVT OriginalVT = InVec.getValueType();
if (InVec.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
if (!InVec.hasOneUse())
return SDValue();
EVT BCVT = InVec.getOperand(0).getValueType();
- if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
+ if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
return SDValue();
InVec = InVec.getOperand(0);
}
+ EVT CurrentVT = InVec.getValueType();
+
if (!isTargetShuffle(InVec.getOpcode()))
return SDValue();
@@ -21741,12 +21799,12 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
SmallVector<int, 16> ShuffleMask;
bool UnaryShuffle;
- if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
- UnaryShuffle))
+ if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
+ ShuffleMask, UnaryShuffle))
return SDValue();
// Select the input vector, guarding against out of range extract vector.
- unsigned NumElems = VT.getVectorNumElements();
+ unsigned NumElems = CurrentVT.getVectorNumElements();
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
@@ -21788,11 +21846,12 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
SDLoc dl(N);
// Create shuffle node taking into account the case that its a unary shuffle
- SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
- Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
+ SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
+ : InVec.getOperand(1);
+ Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
InVec.getOperand(0), Shuffle,
&ShuffleMask[0]);
- Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+ Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
EltNo);
}
TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
DCI.isBeforeLegalizeOps());
if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
- TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
+ (TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+ TLO) &&
+ // Don't optimize vector of constants. Those are handled by
+ // the generic code and all the bits must be properly set for
+ // the generic optimizer.
+ !ISD::isBuildVectorOfConstantSDNodes(TLO.New.getNode())))
DCI.CommitTargetLoweringOpt(TLO);
}