diff options
author | Fei Peng <fei.peng@intel.com> | 2017-02-13 16:07:56 -0800 |
---|---|---|
committer | Fei Peng <fei.peng@intel.com> | 2017-02-13 17:43:44 -0800 |
commit | e6fe29c6c6ca6e4ebc47e0556dfc196055e8dae7 (patch) | |
tree | 418fd6406306e4f75b17b722541028490f0102b5 | |
parent | 539b142c53c66702fd9e76335f591dd946ddbdd5 (diff) | |
download | coreclr-e6fe29c6c6ca6e4ebc47e0556dfc196055e8dae7.tar.gz coreclr-e6fe29c6c6ca6e4ebc47e0556dfc196055e8dae7.tar.bz2 coreclr-e6fe29c6c6ca6e4ebc47e0556dfc196055e8dae7.zip |
Directly support Min/Max intrinsic for Vector<T> on SSE3_4 and above targets
-rw-r--r-- | src/jit/emitxarch.cpp | 16 | ||||
-rw-r--r-- | src/jit/instrsxarch.h | 8 | ||||
-rw-r--r-- | src/jit/simd.cpp | 15 | ||||
-rw-r--r-- | src/jit/simdcodegenxarch.cpp | 38 |
4 files changed, 67 insertions, 10 deletions
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index be5cefbfea..796af19a92 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -81,9 +81,11 @@ bool emitter::IsThreeOperandBinaryAVXInstruction(instruction ins) ins == INS_minsd || ins == INS_divps || ins == INS_divpd || ins == INS_maxps || ins == INS_maxpd || ins == INS_maxss || ins == INS_maxsd || ins == INS_andnps || ins == INS_andnpd || ins == INS_paddb || ins == INS_paddw || ins == INS_paddd || ins == INS_paddq || ins == INS_psubb || ins == INS_psubw || - ins == INS_psubd || ins == INS_psubq || ins == INS_pmuludq || ins == INS_pxor || ins == INS_pmaxub || - ins == INS_pminub || ins == INS_pmaxsw || ins == INS_pminsw || ins == INS_insertps || - ins == INS_vinsertf128 || ins == INS_punpckldq || ins == INS_phaddd); + ins == INS_psubd || ins == INS_psubq || ins == INS_pmuludq || ins == INS_pxor || ins == INS_insertps || + ins == INS_vinsertf128 || ins == INS_punpckldq || ins == INS_phaddd || ins == INS_pminub || + ins == INS_pminsw || ins == INS_pminsb || ins == INS_pminsd || ins == INS_pminuw || ins == INS_pminud || + ins == INS_pmaxub || ins == INS_pmaxsw || ins == INS_pmaxsb || ins == INS_pmaxsd || ins == INS_pmaxuw || + ins == INS_pmaxud); } // Returns true if the AVX instruction is a move operator that requires 3 operands. 
@@ -115,7 +117,9 @@ bool emitter::Is4ByteAVXInstruction(instruction ins) (ins == INS_dpps || ins == INS_dppd || ins == INS_insertps || ins == INS_pcmpeqq || ins == INS_pcmpgtq || ins == INS_vbroadcastss || ins == INS_vbroadcastsd || ins == INS_vpbroadcastb || ins == INS_vpbroadcastw || ins == INS_vpbroadcastd || ins == INS_vpbroadcastq || ins == INS_vextractf128 || ins == INS_vinsertf128 || - ins == INS_pmulld || ins == INS_ptest || ins == INS_phaddd); + ins == INS_pmulld || ins == INS_ptest || ins == INS_phaddd || ins == INS_pminsb || ins == INS_pminsd || + ins == INS_pminuw || ins == INS_pminud || ins == INS_pmaxsb || ins == INS_pmaxsd || ins == INS_pmaxuw || + ins == INS_pmaxud); } #endif // FEATURE_AVX_SUPPORT @@ -135,7 +139,9 @@ bool emitter::Is4ByteSSE4Instruction(instruction ins) return false; #else return UseSSE3_4() && (ins == INS_dpps || ins == INS_dppd || ins == INS_insertps || ins == INS_pcmpeqq || - ins == INS_pcmpgtq || ins == INS_pmulld || ins == INS_ptest || ins == INS_phaddd); + ins == INS_pcmpgtq || ins == INS_pmulld || ins == INS_ptest || ins == INS_phaddd || + ins == INS_pminsb || ins == INS_pminsd || ins == INS_pminuw || ins == INS_pminud || + ins == INS_pmaxsb || ins == INS_pmaxsd || ins == INS_pmaxuw || ins == INS_pmaxud); #endif } diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index 8ab3a845ba..70a7243866 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -323,6 +323,14 @@ INST3( phaddd, "phaddd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( pabsb, "pabsb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x1C)) // Packed absolute value of bytes INST3( pabsw, "pabsw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x1D)) // Packed absolute value of 16-bit integers INST3( pabsd, "pabsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x1E)) // Packed absolute value of 32-bit integers +INST3( pminsb, "pminsb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x38)) // packed minimum signed bytes +INST3( pminsd, "pminsd" , 0, 
IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x39)) // packed minimum 32-bit signed integers +INST3( pminuw, "pminuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3A)) // packed minimum 16-bit unsigned integers +INST3( pminud, "pminud" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3B)) // packed minimum 32-bit unsigned integers +INST3( pmaxsb, "pmaxsb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3C)) // packed maximum signed bytes +INST3( pmaxsd, "pmaxsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3D)) // packed maximum 32-bit signed integers +INST3( pmaxuw, "pmaxuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3E)) // packed maximum 16-bit unsigned integers +INST3( pmaxud, "pmaxud" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3F)) // packed maximum 32-bit unsigned integers INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) diff --git a/src/jit/simd.cpp b/src/jit/simd.cpp index fb190c4fa1..4ba7832cca 100644 --- a/src/jit/simd.cpp +++ b/src/jit/simd.cpp @@ -1374,20 +1374,22 @@ GenTreePtr Compiler::impSIMDMinMax(SIMDIntrinsicID intrinsicId, #ifdef _TARGET_XARCH_ // SSE2 has direct support for float/double/signed word/unsigned byte. + // SSE4.1 has direct support for int32/uint32/signed byte/unsigned word. 
// For other integer types we compute min/max as follows // - // int32/uint32/int64/uint64: + // int32/uint32 (SSE2) + // int64/uint64 (SSE2&SSE3_4): // compResult = (op1 < op2) in case of Min // (op1 > op2) in case of Max // Min/Max(op1, op2) = Select(compResult, op1, op2) // - // unsigned word: + // unsigned word (SSE2): // op1 = op1 - 2^15 ; to make it fit within a signed word // op2 = op2 - 2^15 ; to make it fit within a signed word // result = SSE2 signed word Min/Max(op1, op2) // result = result + 2^15 ; readjust it back // - // signed byte: + // signed byte (SSE2): // op1 = op1 + 2^7 ; to make it unsigned // op2 = op2 + 2^7 ; to make it unsigned // result = SSE2 unsigned byte Min/Max(op1, op2) @@ -1395,13 +1397,16 @@ GenTreePtr Compiler::impSIMDMinMax(SIMDIntrinsicID intrinsicId, GenTree* simdTree = nullptr; - if (varTypeIsFloating(baseType) || baseType == TYP_SHORT || baseType == TYP_UBYTE) + if (varTypeIsFloating(baseType) || baseType == TYP_SHORT || baseType == TYP_UBYTE || + (getSIMDInstructionSet() >= InstructionSet_SSE3_4 && + (baseType == TYP_BYTE || baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_CHAR))) { - // SSE2 has direct support + // SSE2 or SSE4.1 has direct support simdTree = gtNewSIMDNode(simdType, op1, op2, intrinsicId, baseType, size); } else if (baseType == TYP_CHAR || baseType == TYP_BYTE) { + assert(getSIMDInstructionSet() == InstructionSet_SSE2); int constVal; SIMDIntrinsicID operIntrinsic; SIMDIntrinsicID adjustIntrinsic; diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp index ace36422fb..468d302d17 100644 --- a/src/jit/simdcodegenxarch.cpp +++ b/src/jit/simdcodegenxarch.cpp @@ -243,6 +243,25 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type { result = INS_pminsw; } + else if (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4) + { + if (baseType == TYP_BYTE) + { + result = INS_pminsb; + } + else if (baseType == TYP_CHAR) + { + result = INS_pminuw; + } + 
else if (baseType == TYP_INT) + { + result = INS_pminsd; + } + else if (baseType == TYP_UINT) + { + result = INS_pminud; + } + } else { unreached(); @@ -266,6 +285,25 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type { result = INS_pmaxsw; } + else if (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4) + { + if (baseType == TYP_BYTE) + { + result = INS_pmaxsb; + } + else if (baseType == TYP_CHAR) + { + result = INS_pmaxuw; + } + else if (baseType == TYP_INT) + { + result = INS_pmaxsd; + } + else if (baseType == TYP_UINT) + { + result = INS_pmaxud; + } + } else { unreached(); |