summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSivarv <sivarv@microsoft.com>2016-11-23 10:36:46 -0800
committerGitHub <noreply@github.com>2016-11-23 10:36:46 -0800
commita8b6df020149787c82d832bb1ca817ec1ff57441 (patch)
treefafe665bb6c07c95b21bf71fd67899b871c1aa57
parent42aaac5bd8992db64d94e031026a3b4f4aaa2057 (diff)
parentb1643d1efb331a85e832d94f8128c75c588b1003 (diff)
downloadcoreclr-a8b6df020149787c82d832bb1ca817ec1ff57441.tar.gz
coreclr-a8b6df020149787c82d832bb1ca817ec1ff57441.tar.bz2
coreclr-a8b6df020149787c82d832bb1ca817ec1ff57441.zip
Merge pull request #8229 from mikedn/sse-eq
Change vector equality to use pmovmskb
-rw-r--r--src/jit/compiler.hpp15
-rw-r--r--src/jit/instrsxarch.h1
-rw-r--r--src/jit/lowerxarch.cpp6
-rw-r--r--src/jit/simdcodegenxarch.cpp77
4 files changed, 32 insertions, 67 deletions
diff --git a/src/jit/compiler.hpp b/src/jit/compiler.hpp
index 704cf18125..e8358fd2ab 100644
--- a/src/jit/compiler.hpp
+++ b/src/jit/compiler.hpp
@@ -473,10 +473,17 @@ inline unsigned Compiler::funGetFuncIdx(BasicBlock* block)
#endif // !FEATURE_EH_FUNCLETS
-/*****************************************************************************
- *
- * Map a register mask to a register number
- */
+//------------------------------------------------------------------------------
+// genRegNumFromMask : Maps a single register mask to a register number.
+//
+// Arguments:
+// mask - the register mask
+//
+// Return Value:
+// The number of the register contained in the mask.
+//
+// Assumptions:
+// The mask contains one and only one register.
inline regNumber genRegNumFromMask(regMaskTP mask)
{
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index 4b32cd4240..4317334bf2 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -178,6 +178,7 @@ INST3(FIRST_SSE2_INSTRUCTION, "FIRST_SSE2_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CO
// These are the SSE instructions used on x86
INST3( mov_i2xmm, "movd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6E)) // Move int reg to a xmm reg. reg1=xmm reg, reg2=int reg
INST3( mov_xmm2i, "movd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x7E)) // Move xmm reg to an int reg. reg1=xmm reg, reg2=int reg
+INST3( pmovmskb, "pmovmskb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xD7)) // Move the MSB bits of all bytes in a xmm reg to an int reg
INST3( movq, "movq" , 0, IUM_WR, 0, 0, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E))
INST3( movsdsse2, "movsd" , 0, IUM_WR, 0, 0, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10))
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 2fee13f713..b9d2a21cb2 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -2754,14 +2754,14 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
else
{
- // Need two SIMD registers as scratch.
+ // Need one SIMD register as scratch.
// See genSIMDIntrinsicRelOp() for details on code sequence generate and
- // the need for two scratch registers.
+ // the need for one scratch register.
//
// Note these intrinsics produce a BOOL result, hence internal float
// registers reserved are guaranteed to be different from target
// integer register without explicitly specifying.
- info->internalFloatCount = 2;
+ info->internalFloatCount = 1;
info->setInternalCandidates(lsra, lsra->allSIMDRegs());
}
break;
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index a55d344338..20db803443 100644
--- a/src/jit/simdcodegenxarch.cpp
+++ b/src/jit/simdcodegenxarch.cpp
@@ -1141,15 +1141,8 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
}
else
{
- // We need two additional XMM registers as scratch.
- regMaskTP floatRsvdRegs = (simdNode->gtRsvdRegs & RBM_ALLFLOAT);
- assert(floatRsvdRegs != RBM_NONE);
- assert(genCountBits(floatRsvdRegs) == 2);
-
- regMaskTP tmpRegMask = genFindLowestBit(floatRsvdRegs);
- floatRsvdRegs &= ~tmpRegMask;
- regNumber tmpReg1 = genRegNumFromMask(tmpRegMask);
- regNumber tmpReg2 = genRegNumFromMask(floatRsvdRegs);
+ // We need one additional SIMD register to store the result of the SIMD compare.
+ regNumber tmpReg1 = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLFLOAT);
// tmpReg1 = (op1Reg == op2Reg)
// Call this value of tmpReg1 as 'compResult' for further reference below.
@@ -1180,54 +1173,12 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
}
- // If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
- if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
- {
- // Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
- //
- // Generated code sequence
- // - vextractf128 tmpReg2, tmpReg1, 0x01
- // tmpReg2[128..255] <- 0
- // tmpReg2[0..127] <- tmpReg1[128..255]
- // - vandps tmpReg1, tempReg2
- // This will zero-out upper portion of tmpReg1 and
- // lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
- getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
- inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
- }
- // Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
- if (simdType != TYP_SIMD8)
- {
- // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
- // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
- getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
-
- // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
- //
- // Note that what we have computed is as follows at this point:
- // tmpReg1[0] = compResult[0] & compResult[2]
- // tmpReg1[1] = compResult[1] & compResult[3]
- inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
- }
- // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
- // OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
-
- // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
- // tmpReg2[0] = compResult[1] & compResult[3]
- getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
-
- // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
- // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
- inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
-
regNumber intReg;
if (targetReg == REG_NA)
{
// If we are not materializing result into a register,
// we would have reserved an int type internal register.
- regMaskTP intRsvdRegs = (simdNode->gtRsvdRegs & RBM_ALLINT);
- assert(genCountBits(intRsvdRegs) == 1);
- intReg = genRegNumFromMask(intRsvdRegs);
+ intReg = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLINT);
}
else
{
@@ -1238,12 +1189,18 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
assert(genCountBits(simdNode->gtRsvdRegs & RBM_ALLINT) == 0);
}
- // intReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
- // (Note that for mov_xmm2i, the int register is always in the reg2 position.
- inst_RV_RV(INS_mov_xmm2i, tmpReg1, intReg, TYP_INT);
-
- // cmp intReg, 0xFFFFFFFF
- getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, 0xFFFFFFFF);
+ inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType));
+ // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare
+ // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a
+ // subset of each component's ones/zeroes. In the end we need to know if the result is
+ // "all ones" where the number of ones is given by the vector byte size, not by the
+ // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and
+ // for SSE registers we need to compare to 0x0000FFFF.
+ // Note that -1 is used instead of 0xFFFFFFFF, on x64 emit doesn't correctly recognize
+ // that 0xFFFFFFFF can be encoded in a single byte and emits the longer 3DFFFFFFFF
+ // encoding instead of 83F8FF.
+ getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg,
+ emitActualTypeSize(simdType) == 32 ? -1 : 0x0000FFFF);
}
if (targetReg != REG_NA)
@@ -1251,12 +1208,12 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
// If we need to materialize result into a register, targetReg needs to
// be set to 1 on true and zero on false.
// Equality:
- // cmp targetReg, 0xFFFFFFFF
+ // cmp targetReg, 0xFFFFFFFF or 0xFFFF
// sete targetReg
// movzx targetReg, targetReg
//
// InEquality:
- // cmp targetReg, 0xFFFFFFFF
+ // cmp targetReg, 0xFFFFFFFF or 0xFFFF
// setne targetReg
// movzx targetReg, targetReg
//