diff options
author | Sivarv <sivarv@microsoft.com> | 2016-11-23 10:36:46 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-11-23 10:36:46 -0800 |
commit | a8b6df020149787c82d832bb1ca817ec1ff57441 (patch) | |
tree | fafe665bb6c07c95b21bf71fd67899b871c1aa57 | |
parent | 42aaac5bd8992db64d94e031026a3b4f4aaa2057 (diff) | |
parent | b1643d1efb331a85e832d94f8128c75c588b1003 (diff) | |
download | coreclr-a8b6df020149787c82d832bb1ca817ec1ff57441.tar.gz coreclr-a8b6df020149787c82d832bb1ca817ec1ff57441.tar.bz2 coreclr-a8b6df020149787c82d832bb1ca817ec1ff57441.zip |
Merge pull request #8229 from mikedn/sse-eq
Change vector equality to use pmovmskb
-rw-r--r-- | src/jit/compiler.hpp | 15 |
-rw-r--r-- | src/jit/instrsxarch.h | 1 |
-rw-r--r-- | src/jit/lowerxarch.cpp | 6 |
-rw-r--r-- | src/jit/simdcodegenxarch.cpp | 77 |
4 files changed, 32 insertions, 67 deletions
diff --git a/src/jit/compiler.hpp b/src/jit/compiler.hpp
index 704cf18125..e8358fd2ab 100644
--- a/src/jit/compiler.hpp
+++ b/src/jit/compiler.hpp
@@ -473,10 +473,17 @@ inline unsigned Compiler::funGetFuncIdx(BasicBlock* block)
 #endif // !FEATURE_EH_FUNCLETS
-/*****************************************************************************
- *
- * Map a register mask to a register number
- */
+//------------------------------------------------------------------------------
+// genRegNumFromMask : Maps a single register mask to a register number.
+//
+// Arguments:
+// mask - the register mask
+//
+// Return Value:
+// The number of the register contained in the mask.
+//
+// Assumptions:
+// The mask contains one and only one register.
 inline regNumber genRegNumFromMask(regMaskTP mask)
 {
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index 4b32cd4240..4317334bf2 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -178,6 +178,7 @@ INST3(FIRST_SSE2_INSTRUCTION, "FIRST_SSE2_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CO
 // These are the SSE instructions used on x86
 INST3( mov_i2xmm, "movd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6E)) // Move int reg to a xmm reg. reg1=xmm reg, reg2=int reg
 INST3( mov_xmm2i, "movd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x7E)) // Move xmm reg to an int reg. reg1=xmm reg, reg2=int reg
+INST3( pmovmskb, "pmovmskb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xD7)) // Move the MSB bits of all bytes in a xmm reg to an int reg
 INST3( movq, "movq" , 0, IUM_WR, 0, 0, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E))
 INST3( movsdsse2, "movsd" , 0, IUM_WR, 0, 0, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10))
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 2fee13f713..b9d2a21cb2 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -2754,14 +2754,14 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
 else
 {
- // Need two SIMD registers as scratch.
+ // Need one SIMD register as scratch.
 // See genSIMDIntrinsicRelOp() for details on code sequence generate and
- // the need for two scratch registers.
+ // the need for one scratch register.
 //
 // Note these intrinsics produce a BOOL result, hence internal float
 // registers reserved are guaranteed to be different from target
 // integer register without explicitly specifying.
- info->internalFloatCount = 2;
+ info->internalFloatCount = 1;
 info->setInternalCandidates(lsra, lsra->allSIMDRegs());
 }
 break;
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index a55d344338..20db803443 100644
--- a/src/jit/simdcodegenxarch.cpp
+++ b/src/jit/simdcodegenxarch.cpp
@@ -1141,15 +1141,8 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
 }
 else
 {
- // We need two additional XMM registers as scratch.
- regMaskTP floatRsvdRegs = (simdNode->gtRsvdRegs & RBM_ALLFLOAT);
- assert(floatRsvdRegs != RBM_NONE);
- assert(genCountBits(floatRsvdRegs) == 2);
-
- regMaskTP tmpRegMask = genFindLowestBit(floatRsvdRegs);
- floatRsvdRegs &= ~tmpRegMask;
- regNumber tmpReg1 = genRegNumFromMask(tmpRegMask);
- regNumber tmpReg2 = genRegNumFromMask(floatRsvdRegs);
+ // We need one additional SIMD register to store the result of the SIMD compare.
+ regNumber tmpReg1 = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLFLOAT);
 // tmpReg1 = (op1Reg == op2Reg)
 // Call this value of tmpReg1 as 'compResult' for further reference below.
@@ -1180,54 +1173,12 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
 inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
 }
- // If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
- if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
- {
- // Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
- //
- // Generated code sequence
- // - vextractf128 tmpReg2, tmpReg1, 0x01
- // tmpReg2[128..255] <- 0
- // tmpReg2[0..127] <- tmpReg1[128..255]
- // - vandps tmpReg1, tempReg2
- // This will zero-out upper portion of tmpReg1 and
- // lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
- getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
- inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
- }
- // Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
- if (simdType != TYP_SIMD8)
- {
- // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
- // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
- getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
-
- // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
- //
- // Note that what we have computed is as follows at this point:
- // tmpReg1[0] = compResult[0] & compResult[2]
- // tmpReg1[1] = compResult[1] & compResult[3]
- inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
- }
- // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
- // OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
-
- // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
- // tmpReg2[0] = compResult[1] & compResult[3]
- getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
-
- // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
- // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
- inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
-
 regNumber intReg;
 if (targetReg == REG_NA)
 {
 // If we are not materializing result into a register,
 // we would have reserved an int type internal register.
- regMaskTP intRsvdRegs = (simdNode->gtRsvdRegs & RBM_ALLINT);
- assert(genCountBits(intRsvdRegs) == 1);
- intReg = genRegNumFromMask(intRsvdRegs);
+ intReg = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLINT);
 }
 else
 {
@@ -1238,12 +1189,18 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
 assert(genCountBits(simdNode->gtRsvdRegs & RBM_ALLINT) == 0);
 }
- // intReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
- // (Note that for mov_xmm2i, the int register is always in the reg2 position.
- inst_RV_RV(INS_mov_xmm2i, tmpReg1, intReg, TYP_INT);
-
- // cmp intReg, 0xFFFFFFFF
- getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, 0xFFFFFFFF);
+ inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType));
+ // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare
+ // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a
+ // subset of each component's ones/zeroes. In the end we need to know if the result is
+ // "all ones" where the number of ones is given by the vector byte size, not by the
+ // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and
+ // for SSE registers we need to compare to 0x0000FFFF.
+ // Note that -1 is used instead of 0xFFFFFFFF, on x64 emit doesn't correctly recognize
+ // that 0xFFFFFFFF can be encoded in a single byte and emits the longer 3DFFFFFFFF
+ // encoding instead of 83F8FF.
+ getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg,
+ emitActualTypeSize(simdType) == 32 ? -1 : 0x0000FFFF);
 }
 if (targetReg != REG_NA)
@@ -1251,12 +1208,12 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
 // If we need to materialize result into a register, targetReg needs to
 // be set to 1 on true and zero on false.
 // Equality:
- // cmp targetReg, 0xFFFFFFFF
+ // cmp targetReg, 0xFFFFFFFF or 0xFFFF
 // sete targetReg
 // movzx targetReg, targetReg
 //
 // InEquality:
- // cmp targetReg, 0xFFFFFFFF
+ // cmp targetReg, 0xFFFFFFFF or 0xFFFF
 // setne targetReg
 // movzx targetReg, targetReg
 //