summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Fei Peng <fei.peng@intel.com> 2017-02-13 16:07:56 -0800
committer Fei Peng <fei.peng@intel.com> 2017-02-13 17:43:44 -0800
commit e6fe29c6c6ca6e4ebc47e0556dfc196055e8dae7 (patch)
tree 418fd6406306e4f75b17b722541028490f0102b5
parent 539b142c53c66702fd9e76335f591dd946ddbdd5 (diff)
download coreclr-e6fe29c6c6ca6e4ebc47e0556dfc196055e8dae7.tar.gz
coreclr-e6fe29c6c6ca6e4ebc47e0556dfc196055e8dae7.tar.bz2
coreclr-e6fe29c6c6ca6e4ebc47e0556dfc196055e8dae7.zip
Directly support Min/Max intrinsic for Vector<T> on SSE3_4 and above targets
-rw-r--r-- src/jit/emitxarch.cpp 16
-rw-r--r-- src/jit/instrsxarch.h 8
-rw-r--r-- src/jit/simd.cpp 15
-rw-r--r-- src/jit/simdcodegenxarch.cpp 38
4 files changed, 67 insertions, 10 deletions
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index be5cefbfea..796af19a92 100644
--- a/src/jit/emitxarch.cpp
+++ b/src/jit/emitxarch.cpp
@@ -81,9 +81,11 @@ bool emitter::IsThreeOperandBinaryAVXInstruction(instruction ins)
ins == INS_minsd || ins == INS_divps || ins == INS_divpd || ins == INS_maxps || ins == INS_maxpd ||
ins == INS_maxss || ins == INS_maxsd || ins == INS_andnps || ins == INS_andnpd || ins == INS_paddb ||
ins == INS_paddw || ins == INS_paddd || ins == INS_paddq || ins == INS_psubb || ins == INS_psubw ||
- ins == INS_psubd || ins == INS_psubq || ins == INS_pmuludq || ins == INS_pxor || ins == INS_pmaxub ||
- ins == INS_pminub || ins == INS_pmaxsw || ins == INS_pminsw || ins == INS_insertps ||
- ins == INS_vinsertf128 || ins == INS_punpckldq || ins == INS_phaddd);
+ ins == INS_psubd || ins == INS_psubq || ins == INS_pmuludq || ins == INS_pxor || ins == INS_insertps ||
+ ins == INS_vinsertf128 || ins == INS_punpckldq || ins == INS_phaddd || ins == INS_pminub ||
+ ins == INS_pminsw || ins == INS_pminsb || ins == INS_pminsd || ins == INS_pminuw || ins == INS_pminud ||
+ ins == INS_pmaxub || ins == INS_pmaxsw || ins == INS_pmaxsb || ins == INS_pmaxsd || ins == INS_pmaxuw ||
+ ins == INS_pmaxud);
}
// Returns true if the AVX instruction is a move operator that requires 3 operands.
@@ -115,7 +117,9 @@ bool emitter::Is4ByteAVXInstruction(instruction ins)
(ins == INS_dpps || ins == INS_dppd || ins == INS_insertps || ins == INS_pcmpeqq || ins == INS_pcmpgtq ||
ins == INS_vbroadcastss || ins == INS_vbroadcastsd || ins == INS_vpbroadcastb || ins == INS_vpbroadcastw ||
ins == INS_vpbroadcastd || ins == INS_vpbroadcastq || ins == INS_vextractf128 || ins == INS_vinsertf128 ||
- ins == INS_pmulld || ins == INS_ptest || ins == INS_phaddd);
+ ins == INS_pmulld || ins == INS_ptest || ins == INS_phaddd || ins == INS_pminsb || ins == INS_pminsd ||
+ ins == INS_pminuw || ins == INS_pminud || ins == INS_pmaxsb || ins == INS_pmaxsd || ins == INS_pmaxuw ||
+ ins == INS_pmaxud);
}
#endif // FEATURE_AVX_SUPPORT
@@ -135,7 +139,9 @@ bool emitter::Is4ByteSSE4Instruction(instruction ins)
return false;
#else
return UseSSE3_4() && (ins == INS_dpps || ins == INS_dppd || ins == INS_insertps || ins == INS_pcmpeqq ||
- ins == INS_pcmpgtq || ins == INS_pmulld || ins == INS_ptest || ins == INS_phaddd);
+ ins == INS_pcmpgtq || ins == INS_pmulld || ins == INS_ptest || ins == INS_phaddd ||
+ ins == INS_pminsb || ins == INS_pminsd || ins == INS_pminuw || ins == INS_pminud ||
+ ins == INS_pmaxsb || ins == INS_pmaxsd || ins == INS_pmaxuw || ins == INS_pmaxud);
#endif
}
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index 8ab3a845ba..70a7243866 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -323,6 +323,14 @@ INST3( phaddd, "phaddd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS
INST3( pabsb, "pabsb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x1C)) // Packed absolute value of bytes
INST3( pabsw, "pabsw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x1D)) // Packed absolute value of 16-bit integers
INST3( pabsd, "pabsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x1E)) // Packed absolute value of 32-bit integers
+INST3( pminsb, "pminsb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x38)) // packed minimum signed bytes
+INST3( pminsd, "pminsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x39)) // packed minimum 32-bit signed integers
+INST3( pminuw, "pminuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3A)) // packed minimum 16-bit unsigned integers
+INST3( pminud, "pminud" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3B)) // packed minimum 32-bit unsigned integers
+INST3( pmaxsb, "pmaxsb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3C)) // packed maximum signed bytes
+INST3( pmaxsd, "pmaxsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3D)) // packed maximum 32-bit signed integers
+INST3( pmaxuw, "pmaxuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3E)) // packed maximum 16-bit unsigned integers
+INST3( pmaxud, "pmaxud" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3F)) // packed maximum 32-bit unsigned integers
INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
diff --git a/src/jit/simd.cpp b/src/jit/simd.cpp
index fb190c4fa1..4ba7832cca 100644
--- a/src/jit/simd.cpp
+++ b/src/jit/simd.cpp
@@ -1374,20 +1374,22 @@ GenTreePtr Compiler::impSIMDMinMax(SIMDIntrinsicID intrinsicId,
#ifdef _TARGET_XARCH_
// SSE2 has direct support for float/double/signed word/unsigned byte.
+ // SSE4.1 has direct support for int32/uint32/signed byte/unsigned word.
// For other integer types we compute min/max as follows
//
- // int32/uint32/int64/uint64:
+ // int32/uint32 (SSE2)
+ // int64/uint64 (SSE2&SSE3_4):
// compResult = (op1 < op2) in case of Min
// (op1 > op2) in case of Max
// Min/Max(op1, op2) = Select(compResult, op1, op2)
//
- // unsigned word:
+ // unsigned word (SSE2):
// op1 = op1 - 2^15 ; to make it fit within a signed word
// op2 = op2 - 2^15 ; to make it fit within a signed word
// result = SSE2 signed word Min/Max(op1, op2)
// result = result + 2^15 ; readjust it back
//
- // signed byte:
+ // signed byte (SSE2):
// op1 = op1 + 2^7 ; to make it unsigned
// op2 = op2 + 2^7 ; to make it unsigned
// result = SSE2 unsigned byte Min/Max(op1, op2)
@@ -1395,13 +1397,16 @@ GenTreePtr Compiler::impSIMDMinMax(SIMDIntrinsicID intrinsicId,
GenTree* simdTree = nullptr;
- if (varTypeIsFloating(baseType) || baseType == TYP_SHORT || baseType == TYP_UBYTE)
+ if (varTypeIsFloating(baseType) || baseType == TYP_SHORT || baseType == TYP_UBYTE ||
+ (getSIMDInstructionSet() >= InstructionSet_SSE3_4 &&
+ (baseType == TYP_BYTE || baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_CHAR)))
{
- // SSE2 has direct support
+ // SSE2 or SSE4.1 has direct support
simdTree = gtNewSIMDNode(simdType, op1, op2, intrinsicId, baseType, size);
}
else if (baseType == TYP_CHAR || baseType == TYP_BYTE)
{
+ assert(getSIMDInstructionSet() == InstructionSet_SSE2);
int constVal;
SIMDIntrinsicID operIntrinsic;
SIMDIntrinsicID adjustIntrinsic;
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index ace36422fb..468d302d17 100644
--- a/src/jit/simdcodegenxarch.cpp
+++ b/src/jit/simdcodegenxarch.cpp
@@ -243,6 +243,25 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
{
result = INS_pminsw;
}
+ else if (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4)
+ {
+ if (baseType == TYP_BYTE)
+ {
+ result = INS_pminsb;
+ }
+ else if (baseType == TYP_CHAR)
+ {
+ result = INS_pminuw;
+ }
+ else if (baseType == TYP_INT)
+ {
+ result = INS_pminsd;
+ }
+ else if (baseType == TYP_UINT)
+ {
+ result = INS_pminud;
+ }
+ }
else
{
unreached();
@@ -266,6 +285,25 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
{
result = INS_pmaxsw;
}
+ else if (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4)
+ {
+ if (baseType == TYP_BYTE)
+ {
+ result = INS_pmaxsb;
+ }
+ else if (baseType == TYP_CHAR)
+ {
+ result = INS_pmaxuw;
+ }
+ else if (baseType == TYP_INT)
+ {
+ result = INS_pmaxsd;
+ }
+ else if (baseType == TYP_UINT)
+ {
+ result = INS_pmaxud;
+ }
+ }
else
{
unreached();