author:    Chandler Carruth <chandlerc@gmail.com>
           Sun, 21 Sep 2014 13:35:14 +0000 (13:35 +0000)
committer: Chandler Carruth <chandlerc@gmail.com>
           Sun, 21 Sep 2014 13:35:14 +0000 (13:35 +0000)
lowering when it can use a symmetric SHUFPS across both 128-bit lanes.
This required making the SHUFPS lowering tolerant of other vector types,
and adjusting our canonicalization to canonicalize harder.
This is the last of the clever uses of symmetry I've thought of for
v8f32. The rest of the tricks I'm aware of here are to work around
asymmetry in the mask.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218216 91177308-0d34-0410-b5e6-96231b3b80d8
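As an illustration of the symmetry being exploited, here is a minimal standalone sketch (the helper and types are hypothetical, not LLVM's): a v8f32 shuffle can reuse the 4-lane SHUFPS logic when each element of the high 128-bit lane selects the same slot as the corresponding low-lane element, one lane up.

#include <array>
#include <cstdio>

// Hypothetical check, not LLVM's: with inputs %a (indices 0-7) and
// %b (indices 8-15), a v8f32 mask is symmetric across the two 128-bit
// lanes when element i+4 picks the same per-lane slot as element i,
// i.e. the same index shifted up by one lane (+4) within its input.
static bool isLaneSymmetric(const std::array<int, 8> &Mask) {
  for (int i = 0; i < 4; ++i)
    if (Mask[i + 4] != Mask[i] + 4)
      return false;
  return true;
}

int main() {
  // The mask from the shuffle_v8f32_08084c4c test below.
  std::array<int, 8> M = {0, 8, 0, 8, 4, 12, 4, 12};
  std::printf("symmetric: %s\n", isLaneSymmetric(M) ? "yes" : "no"); // yes
  return 0;
}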
lib/Target/X86/X86ISelLowering.cpp
test/CodeGen/X86/vector-shuffle-256-v8.ll
index 14613d3c6e1caf9d64dcc032c539cb78d66d5204..2a020b51001ec475e13aa56df9ca86165510719b 100644 (file)
// To make this work, blend them together as the first step.
int V1Index = V2AdjIndex;
int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
- V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
+ V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
getV4X86ShuffleImm8ForMask(BlendMask, DAG));
// Now proceed to reconstruct the final blend as we have the necessary
// high or low half formed.
int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
Mask[2] < 4 ? Mask[2] : Mask[3],
(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
- V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
+ V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
getV4X86ShuffleImm8ForMask(BlendMask, DAG));
// Now we do a normal shuffle of V1 by giving V1 as both operands to
// a blend.
NewMask[3] = Mask[2] < 4 ? 3 : 1;
}
}
- return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
getV4X86ShuffleImm8ForMask(NewMask, DAG));
}
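For reference, a rough sketch of the imm8 that getV4X86ShuffleImm8ForMask builds for these SHUFP nodes, reconstructed from the x86 SHUFPS encoding (the exact LLVM helper also handles undef lanes, which this sketch omits):

#include <cassert>
#include <cstdio>

// Sketch: four 2-bit fields, one per result element. SHUFPS draws result
// elements 0-1 from the first operand and 2-3 from the second, so each
// index here is already relative to its own operand (0-3).
static unsigned shuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    assert(Mask[i] >= 0 && Mask[i] < 4 && "per-operand index expected");
    Imm |= (unsigned)Mask[i] << (2 * i);
  }
  return Imm;
}

int main() {
  int Mask[4] = {1, 0, 3, 2}; // swap within pairs, per 128-bit lane
  std::printf("imm8 = 0x%02x\n", shuffleImm8(Mask)); // imm8 = 0xb1
  return 0;
}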
if (isShuffleEquivalent(LoMask, 0, 8, 1, 9))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
if (isShuffleEquivalent(LoMask, 2, 10, 3, 11))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
+
+ // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
+ // have already handled any direct blends.
+ int SHUFPSMask[] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+ for (int &M : SHUFPSMask)
+ if (M >= 8)
+ M -= 4;
+ return lowerVectorShuffleWithSHUPFS(DL, MVT::v8f32, SHUFPSMask, V1, V2, DAG);
}
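As a standalone illustration (hypothetical harness, plain C++) of what this fallback computes: in the v8f32 mask, V2's low-lane elements are numbered 8-11, while the 4-wide SHUFPS path expects V2 elements as 4-7; the M -= 4 remap bridges the two numberings.

#include <array>
#include <cstdio>

int main() {
  // Low-lane mask of a lane-symmetric v8f32 shuffle: V1 elements are 0-3,
  // V2 elements are 8-11 (from shuffle_v8f32_08084c4c below).
  std::array<int, 4> Mask = {0, 8, 0, 8};
  for (int &M : Mask)
    if (M >= 8)
      M -= 4; // remap V2's low lane into the 4-7 range SHUFPS expects
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 4 0 4
  std::printf("\n");
  return 0;
}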
if (isSingleInputShuffleMask(Mask))
return DAG.getCommutedVectorShuffle(*SVOp);
// When the number of V1 and V2 elements is the same, try to minimize the
- // number of uses of V2 in the low half of the vector.
+ // number of uses of V2 in the low half of the vector. When that is tied,
+ // ensure that the sum of indices for V1 is equal to or lower than the sum
+ // of indices for V2.
if (NumV1Elements == NumV2Elements) {
int LowV1Elements = 0, LowV2Elements = 0;
for (int M : SVOp->getMask().slice(0, NumElements / 2))
if (M >= NumElements) ++LowV2Elements;
else if (M >= 0) ++LowV1Elements;
if (LowV2Elements > LowV1Elements)
return DAG.getCommutedVectorShuffle(*SVOp);
+
+ int SumV1Indices = 0, SumV2Indices = 0;
+ for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+ if (SVOp->getMask()[i] >= NumElements)
+ SumV2Indices += i;
+ else if (SVOp->getMask()[i] >= 0)
+ SumV1Indices += i;
+ if (SumV2Indices < SumV1Indices)
+ return DAG.getCommutedVectorShuffle(*SVOp);
}
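Condensed into a standalone sketch (simplified int-vector types; the real code walks SVOp->getMask()), the commute decision for the tied case reads roughly as follows, with the second tier only breaking ties left by the first, as the comment above describes:

#include <cstdio>
#include <vector>

// Sketch of the tied-case canonicalization: returns true when the shuffle
// operands should be commuted. Assumes NumV1Elements == NumV2Elements has
// already been established, as in the guarded block above.
static bool shouldCommute(const std::vector<int> &Mask, int NumElements) {
  // First tier: prefer fewer V2 elements in the low half.
  int LowV1 = 0, LowV2 = 0;
  for (int i = 0; i < NumElements / 2; ++i)
    if (Mask[i] >= NumElements) ++LowV2;
    else if (Mask[i] >= 0) ++LowV1;
  if (LowV2 != LowV1)
    return LowV2 > LowV1;

  // Second tier: prefer V1 elements at lower positions overall, i.e. a
  // smaller position-sum for V1 than for V2.
  int SumV1 = 0, SumV2 = 0;
  for (int i = 0, Size = (int)Mask.size(); i < Size; ++i)
    if (Mask[i] >= NumElements) SumV2 += i;
    else if (Mask[i] >= 0) SumV1 += i;
  return SumV2 < SumV1;
}

int main() {
  // <8,0,9,1,12,4,13,5> puts V2 first in every pair: the low-half counts
  // tie (two each), but V2's position-sum (0+2+4+6) beats V1's (1+3+5+7).
  std::vector<int> Mask = {8, 0, 9, 1, 12, 4, 13, 5};
  std::printf("commute: %s\n", shouldCommute(Mask, 8) ? "yes" : "no"); // yes
  return 0;
}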
// For each vector width, delegate to a specialized lowering routine.
index 2d4f15769bb6ae4ddbe34f3ff347c7a9cf8c2143..311fe7e4734e502b1da98a8802996e5c52eeb387 100644 (file)
define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_08084c4c
; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[0,0,2,0,4,4,6,4]
-; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,0,3,4,5,4,7]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm0[0,2,1,3,4,6,5,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x float> %shuffle
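To see why this is a profitable trade (two shuffles instead of three), one can trace the new sequence on symbolic indices; a minimal C++ check, using 0-7 for %a's elements and 8-15 for %b's:

#include <array>
#include <cstdio>

// Trace the two-vshufps sequence for shuffle_v8f32_08084c4c above.
int main() {
  std::array<int, 8> A = {0, 1, 2, 3, 4, 5, 6, 7};
  std::array<int, 8> B = {8, 9, 10, 11, 12, 13, 14, 15};

  // Step 1: vshufps ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4].
  // Per 128-bit lane, SHUFPS takes two elements from each input.
  std::array<int, 8> T = {A[0], A[0], B[0], B[0], A[4], A[4], B[4], B[4]};

  // Step 2: vshufps of T with itself, per-lane pattern [0,2,1,3].
  std::array<int, 8> R = {T[0], T[2], T[1], T[3], T[4], T[6], T[5], T[7]};

  // Expect the IR mask <0,8,0,8,4,12,4,12>.
  for (int v : R)
    std::printf("%d ", v); // 0 8 0 8 4 12 4 12
  std::printf("\n");
  return 0;
}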
define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_8823cc67
; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[0,0,2,3,4,4,6,7]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x float> %shuffle
define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_9832dc76
; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[1,0,2,3,5,4,6,7]
-; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,3,2,4,5,7,6]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x float> %shuffle
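This test also shows the two halves of the patch cooperating: the position-sum tie-break commutes %a and %b, after which the whole shuffle collapses to a single vshufps. A rough trace (plain C++, symbolic indices):

#include <array>
#include <cstdio>

// Trace shuffle_v8f32_9832dc76 above: after the position-sum tie-break
// commutes the operands, the shuffle becomes one SHUFPS.
int main() {
  // Original mask; 0-7 are %a, 8-15 are %b.
  std::array<int, 8> Mask = {9, 8, 3, 2, 13, 12, 7, 6};

  // Position sums: V2 elements sit at 0,1,4,5 (sum 10), V1 at 2,3,6,7
  // (sum 18), so the canonicalization commutes: swap the inputs and flip
  // each index across the 8-element boundary.
  for (int &M : Mask)
    M = M >= 8 ? M - 8 : M + 8;
  // Mask is now {1,0,11,10,5,4,15,14} with %b as V1 and %a as V2.

  // Low-lane SHUFPS mask after the M -= 4 remap from the patch:
  std::array<int, 4> S = {Mask[0], Mask[1], Mask[2], Mask[3]};
  for (int &M : S)
    if (M >= 8)
      M -= 4;
  for (int M : S)
    std::printf("%d ", M); // prints: 1 0 7 6
  std::printf("\n");
  return 0;
}

Feeding the resulting {1,0,7,6} through the SHUFPS path's easy case (V1 in the low half, V2 in the high half) gives the per-lane mask [1,0,3,2] seen in the vshufps above.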
define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: @shuffle_v8f32_9810dc54
; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*}} # ymm1 = ymm1[1,0,2,3,5,4,6,7]
-; ALL-NEXT: vpermilps {{.*}} # ymm0 = ymm0[0,1,1,0,4,5,5,4]
-; ALL-NEXT: vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vshufps {{.*}} # ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x float> %shuffle