summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorTanner Gooding <tagoo@outlook.com>2018-02-25 09:27:18 -0800
committerTanner Gooding <tagoo@outlook.com>2018-02-28 07:19:27 -0800
commitc23b4d6780095ac1e58f785e8f635d94120e1f64 (patch)
tree9e3673e005498bd34b080280a74cb70567dbf2a1 /src
parent7207e1b422de2d2ad451605d5753a7e7b1883a08 (diff)
downloadcoreclr-c23b4d6780095ac1e58f785e8f635d94120e1f64.tar.gz
coreclr-c23b4d6780095ac1e58f785e8f635d94120e1f64.tar.bz2
coreclr-c23b4d6780095ac1e58f785e8f635d94120e1f64.zip
Adding partial support for the SSE41 hardware intrinsics
Diffstat (limited to 'src')
-rw-r--r--src/jit/emitxarch.cpp4
-rw-r--r--src/jit/hwintrinsiccodegenxarch.cpp97
-rw-r--r--src/jit/hwintrinsiclistxarch.h45
-rw-r--r--src/jit/hwintrinsicxarch.cpp39
-rw-r--r--src/jit/instrsxarch.h16
-rw-r--r--src/jit/lsraxarch.cpp7
6 files changed, 200 insertions, 8 deletions
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index 7e47ccfb7d..2753c2dc81 100644
--- a/src/jit/emitxarch.cpp
+++ b/src/jit/emitxarch.cpp
@@ -87,6 +87,8 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
case INS_andnps:
case INS_andpd:
case INS_andps:
+ case INS_blendpd:
+ case INS_blendps:
case INS_cmppd:
case INS_cmpps:
case INS_cmpsd:
@@ -114,6 +116,7 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
case INS_minss:
case INS_movhlps:
case INS_movlhps:
+ case INS_mpsadbw:
case INS_mulpd:
case INS_mulps:
case INS_mulsd:
@@ -137,6 +140,7 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
case INS_pandn:
case INS_pavgb:
case INS_pavgw:
+ case INS_pblendw:
case INS_pcmpeqb:
case INS_pcmpeqd:
case INS_pcmpeqq:
diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp
index dbc2e35fcb..2c6a184b3a 100644
--- a/src/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/jit/hwintrinsiccodegenxarch.cpp
@@ -91,6 +91,10 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
}
+ else if ((ival != -1) && varTypeIsFloating(baseType))
+ {
+ emit->emitIns_R_R_I(ins, simdSize, targetReg, op1Reg, ival);
+ }
else
{
emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
@@ -1027,7 +1031,98 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
//
void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
{
- NYI("Implement SSE41 intrinsic code generation");
+ NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
+ GenTree* op1 = node->gtGetOp1();
+ GenTree* op2 = node->gtGetOp2();
+ GenTree* op3 = nullptr;
+ GenTree* op4 = nullptr;
+ regNumber targetReg = node->gtRegNum;
+ var_types targetType = node->TypeGet();
+ var_types baseType = node->gtSIMDBaseType;
+
+ regNumber op1Reg = REG_NA;
+ regNumber op2Reg = REG_NA;
+ regNumber op3Reg = REG_NA;
+ regNumber op4Reg = REG_NA;
+ emitter* emit = getEmitter();
+
+ if ((op1 != nullptr) && !op1->OperIsList())
+ {
+ op1Reg = op1->gtRegNum;
+ genConsumeOperands(node);
+ }
+
+ switch (intrinsicID)
+ {
+ case NI_SSE41_CeilingScalar:
+ case NI_SSE41_FloorScalar:
+ case NI_SSE41_RoundCurrentDirectionScalar:
+ case NI_SSE41_RoundToNearestIntegerScalar:
+ case NI_SSE41_RoundToNegativeInfinityScalar:
+ case NI_SSE41_RoundToPositiveInfinityScalar:
+ case NI_SSE41_RoundToZeroScalar:
+ {
+ assert((baseType == TYP_FLOAT) || (baseType == TYP_DOUBLE));
+ instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
+
+ if (op2 == nullptr)
+ {
+ int ival = Compiler::ivalOfHWIntrinsic(intrinsicID);
+ emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op1Reg, ival);
+ }
+ else
+ {
+ genHWIntrinsic_R_R_RM_I(node, ins);
+ }
+ break;
+ }
+
+ case NI_SSE41_TestAllOnes:
+ {
+ regNumber tmpReg = node->GetSingleTempReg();
+ assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
+ emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
+ emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
+ emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
+ emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
+ break;
+ }
+
+ case NI_SSE41_TestAllZeros:
+ case NI_SSE41_TestZ:
+ {
+ assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
+ emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
+ emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
+ emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
+ break;
+ }
+
+ case NI_SSE41_TestC:
+ {
+ assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
+ emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
+ emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
+ emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
+ break;
+ }
+
+ case NI_SSE41_TestMixOnesZeros:
+ case NI_SSE41_TestNotZAndNotC:
+ {
+ assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
+ emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
+ emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
+ emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
+ break;
+ }
+
+ default:
+ unreached();
+ break;
+ }
+
+ genProduceReg(node);
}
//------------------------------------------------------------------------
diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h
index 0c28863f17..be9d031136 100644
--- a/src/jit/hwintrinsiclistxarch.h
+++ b/src/jit/hwintrinsiclistxarch.h
@@ -261,12 +261,47 @@ HARDWARE_INTRINSIC(SSSE3_MultiplyHighRoundScale, "MultiplyHi
HARDWARE_INTRINSIC(SSSE3_Shuffle, "Shuffle", SSSE3, -1, 16, 2, {INS_pshufb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSSE3_Sign, "Sign", SSSE3, -1, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+// ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
+// Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags
+// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
+// ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// SSE41 Intrinsics
-HARDWARE_INTRINSIC(SSE41_IsSupported, "get_IsSupported", SSE41, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41_BlendVariable, "BlendVariable", SSE41, -1, 16, 3, {INS_pblendvb, INS_pblendvb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41_CompareEqual, "CompareEqual", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(SSE41_LoadAlignedVector128NonTemporal, "LoadAlignedVector128NonTemporal", SSE41, -1, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41_Multiply, "Multiply", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE41_IsSupported, "get_IsSupported", SSE41, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_Blend, "Blend", SSE41, -1, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE41_BlendVariable, "BlendVariable", SSE41, -1, 16, 3, {INS_pblendvb, INS_pblendvb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_Ceiling, "Ceiling", SSE41, 10, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_CeilingScalar, "CeilingScalar", SSE41, 10, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE41_CompareEqual, "CompareEqual", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int16, "ConvertToVector128Int16", SSE41, -1, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg)
+HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int32, "ConvertToVector128Int32", SSE41, -1, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg)
+HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int64, "ConvertToVector128Int64", SSE41, -1, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg)
+HARDWARE_INTRINSIC(SSE41_DotProduct, "DotProduct", SSE41, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE41_Floor, "Floor", SSE41, 9, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_FloorScalar, "FloorScalar", SSE41, 9, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE41_LoadAlignedVector128NonTemporal, "LoadAlignedVector128NonTemporal", SSE41, -1, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_Max, "Max", SSE41, -1, 16, 2, {INS_pmaxsb, INS_invalid, INS_invalid, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_Min, "Min", SSE41, -1, 16, 2, {INS_pminsb, INS_invalid, INS_invalid, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_MinHorizontal, "MinHorizontal", SSE41, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_MultipleSumAbsoluteDifferences, "MultipleSumAbsoluteDifferences", SSE41, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE41_Multiply, "Multiply", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE41_MultiplyLow, "MultiplyLow", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_PackUnsignedSaturate, "PackUnsignedSaturate", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_RoundCurrentDirection, "RoundCurrentDirection", SSE41, 4, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_RoundCurrentDirectionScalar, "RoundCurrentDirectionScalar", SSE41, 4, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE41_RoundToNearestInteger, "RoundToNearestInteger", SSE41, 8, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_RoundToNearestIntegerScalar, "RoundToNearestIntegerScalar", SSE41, 8, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE41_RoundToNegativeInfinity, "RoundToNegativeInfinity", SSE41, 9, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_RoundToNegativeInfinityScalar, "RoundToNegativeInfinityScalar", SSE41, 9, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE41_RoundToPositiveInfinity, "RoundToPositiveInfinity", SSE41, 10, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_RoundToPositiveInfinityScalar, "RoundToPositiveInfinityScalar", SSE41, 10, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE41_RoundToZero, "RoundToZero", SSE41, 11, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE41_RoundToZeroScalar, "RoundToZeroScalar", SSE41, 11, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE41_TestAllOnes, "TestAllOnes", SSE41, -1, 16, 1, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE41_TestAllZeros, "TestAllZeros", SSE41, -1, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE41_TestC, "TestC", SSE41, -1, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE41_TestMixOnesZeros, "TestMixOnesZeros", SSE41, -1, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE41_TestNotZAndNotC, "TestNotZAndNotC", SSE41, -1, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE41_TestZ, "TestZ", SSE41, -1, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment)
// SSE42 Intrinsics
HARDWARE_INTRINSIC(SSE42_IsSupported, "get_IsSupported", SSE42, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp
index b7fbcfcf70..c4b482325c 100644
--- a/src/jit/hwintrinsicxarch.cpp
+++ b/src/jit/hwintrinsicxarch.cpp
@@ -1083,7 +1083,44 @@ GenTree* Compiler::impSSE41Intrinsic(NamedIntrinsic intrinsic,
CORINFO_SIG_INFO* sig,
bool mustExpand)
{
- return nullptr;
+ GenTree* retNode = nullptr;
+ GenTree* op1 = nullptr;
+ GenTree* op2 = nullptr;
+ GenTree* op3 = nullptr;
+ GenTree* op4 = nullptr;
+ int simdSize = simdSizeOfHWIntrinsic(intrinsic, sig);
+
+ assert(simdSize == 16);
+
+ switch (intrinsic)
+ {
+ case NI_SSE41_CeilingScalar:
+ case NI_SSE41_FloorScalar:
+ case NI_SSE41_RoundCurrentDirectionScalar:
+ case NI_SSE41_RoundToNearestIntegerScalar:
+ case NI_SSE41_RoundToNegativeInfinityScalar:
+ case NI_SSE41_RoundToPositiveInfinityScalar:
+ case NI_SSE41_RoundToZeroScalar:
+ {
+ assert((sig->numArgs == 1) || (sig->numArgs == 2));
+ var_types baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
+ assert((baseType == TYP_FLOAT) || (baseType == TYP_DOUBLE));
+
+ if (sig->numArgs == 2)
+ {
+ op2 = impSIMDPopStack(TYP_SIMD16);
+ }
+
+ op1 = impSIMDPopStack(TYP_SIMD16);
+ retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, baseType, simdSize);
+ break;
+ }
+
+ default:
+ JITDUMP("Not implemented hardware intrinsic");
+ break;
+ }
+ return retNode;
}
GenTree* Compiler::impSSE42Intrinsic(NamedIntrinsic intrinsic,
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index 3d314a699f..ae8adffea6 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -416,16 +416,28 @@ INST3( pmaxsd, "pmaxsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS
INST3( pmaxuw, "pmaxuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3E)) // packed maximum 16-bit unsigned integers
INST3( pmaxud, "pmaxud" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3F)) // packed maximum 32-bit unsigned integers
INST3( pmovsxbw, "pmovsxbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x20)) // Packed sign extend byte to short
+INST3( pmovsxbd, "pmovsxbd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x21)) // Packed sign extend byte to int
+INST3( pmovsxbq, "pmovsxbq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x22)) // Packed sign extend byte to long
INST3( pmovsxwd, "pmovsxwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x23)) // Packed sign extend short to int
+INST3( pmovsxwq, "pmovsxwq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x24)) // Packed sign extend short to long
INST3( pmovsxdq, "pmovsxdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x25)) // Packed sign extend int to long
+INST3( pmovzxbw, "pmovzxbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x30)) // Packed zero extend byte to short
+INST3( pmovzxbd, "pmovzxbd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x31)) // Packed zero extend byte to intg
+INST3( pmovzxbq, "pmovzxbq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x32)) // Packed zero extend byte to lon
+INST3( pmovzxwd, "pmovzxwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x33)) // Packed zero extend short to int
+INST3( pmovzxwq, "pmovzxwq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x34)) // Packed zero extend short to long
+INST3( pmovzxdq, "pmovzxdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x35)) // Packed zero extend int to long
INST3( packusdw, "packusdw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2B)) // Pack (narrow) int to unsigned short with saturation
INST3( roundps, "roundps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x08)) // Round packed single precision floating-point values
INST3( roundss, "roundss" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0A)) // Round scalar single precision floating-point values
INST3( roundpd, "roundpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x09)) // Round packed double precision floating-point values
INST3( roundsd, "roundsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0B)) // Round scalar double precision floating-point values
INST3( pmuldq, "pmuldq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x28)) // packed multiply 32-bit signed integers and store 64-bit result
+INST3( blendps, "blendps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0C)) // Blend Packed Single Precision Floating-Point Values
INST3( blendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x14)) // Variable Blend Packed Singles
+INST3( blendpd, "blendpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0D)) // Blend Packed Double Precision Floating-Point Values
INST3( blendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x15)) // Variable Blend Packed Doubles
+INST3( pblendw, "pblendw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0E)) // Blend Packed Words
INST3( pblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x10)) // Variable Blend Packed Bytes
INST3( phaddw, "phaddw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x01)) // Packed horizontal add of 16-bit integers
INST3( phsubw, "phsubw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x05)) // Packed horizontal subtract of 16-bit integers
@@ -434,9 +446,11 @@ INST3( phaddsw, "phaddsw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS
INST3( phsubsw, "phsubsw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x07)) // Packed horizontal subtract of 16-bit integers with saturation
INST3( lddqu, "lddqu" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEDBL(0xF0)) // Load Unaligned integer
INST3( movntdqa, "movntdqa" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2A)) // Load Double Quadword Non-Temporal Aligned Hint
-INST3( movddup, "movddup" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEDBL(0x12)) // Replicate Double FP Values
+INST3( movddup, "movddup" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEDBL(0x12)) // Replicate Double FP Values
INST3( movsldup, "movsldup" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEFLT(0x12)) // Replicate even-indexed Single FP Values
INST3( movshdup, "movshdup" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEFLT(0x16)) // Replicate odd-indexed Single FP Values
+INST3( phminposuw, "phminposuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x41)) // Packed Horizontal Word Minimum
+INST3( mpsadbw, "mpsadbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x42)) // Compute Multiple Packed Sums of Absolute Difference
INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp
index 6ff82a78ee..e5a74fdf84 100644
--- a/src/jit/lsraxarch.cpp
+++ b/src/jit/lsraxarch.cpp
@@ -2344,6 +2344,13 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
}
break;
+ case NI_SSE41_TestAllOnes:
+ {
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(this, allSIMDRegs());
+ break;
+ }
+
#ifdef _TARGET_X86_
case NI_SSE42_Crc32:
{