diff options
-rw-r--r-- | src/jit/emitxarch.cpp | 4 | ||||
-rw-r--r-- | src/jit/hwintrinsiccodegenxarch.cpp | 97 | ||||
-rw-r--r-- | src/jit/hwintrinsiclistxarch.h | 45 | ||||
-rw-r--r-- | src/jit/hwintrinsicxarch.cpp | 39 | ||||
-rw-r--r-- | src/jit/instrsxarch.h | 16 | ||||
-rw-r--r-- | src/jit/lsraxarch.cpp | 7 |
6 files changed, 200 insertions, 8 deletions
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index 7e47ccfb7d..2753c2dc81 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -87,6 +87,8 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins) case INS_andnps: case INS_andpd: case INS_andps: + case INS_blendpd: + case INS_blendps: case INS_cmppd: case INS_cmpps: case INS_cmpsd: @@ -114,6 +116,7 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins) case INS_minss: case INS_movhlps: case INS_movlhps: + case INS_mpsadbw: case INS_mulpd: case INS_mulps: case INS_mulsd: @@ -137,6 +140,7 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins) case INS_pandn: case INS_pavgb: case INS_pavgw: + case INS_pblendw: case INS_pcmpeqb: case INS_pcmpeqd: case INS_pcmpeqq: diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp index dbc2e35fcb..2c6a184b3a 100644 --- a/src/jit/hwintrinsiccodegenxarch.cpp +++ b/src/jit/hwintrinsiccodegenxarch.cpp @@ -91,6 +91,10 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg); } + else if ((ival != -1) && varTypeIsFloating(baseType)) + { + emit->emitIns_R_R_I(ins, simdSize, targetReg, op1Reg, ival); + } else { emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg); @@ -1027,7 +1031,98 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node) // void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node) { - NYI("Implement SSE41 intrinsic code generation"); + NamedIntrinsic intrinsicID = node->gtHWIntrinsicId; + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + GenTree* op3 = nullptr; + GenTree* op4 = nullptr; + regNumber targetReg = node->gtRegNum; + var_types targetType = node->TypeGet(); + var_types baseType = node->gtSIMDBaseType; + + regNumber op1Reg = REG_NA; + regNumber op2Reg = REG_NA; + regNumber op3Reg = REG_NA; + regNumber op4Reg = REG_NA; + emitter* emit = getEmitter(); + + if ((op1 != nullptr) && !op1->OperIsList()) + { + op1Reg = op1->gtRegNum; + genConsumeOperands(node); + } + + switch (intrinsicID) + { + case NI_SSE41_CeilingScalar: + case NI_SSE41_FloorScalar: + case NI_SSE41_RoundCurrentDirectionScalar: + case NI_SSE41_RoundToNearestIntegerScalar: + case NI_SSE41_RoundToNegativeInfinityScalar: + case NI_SSE41_RoundToPositiveInfinityScalar: + case NI_SSE41_RoundToZeroScalar: + { + assert((baseType == TYP_FLOAT) || (baseType == TYP_DOUBLE)); + instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); + + if (op2 == nullptr) + { + int ival = Compiler::ivalOfHWIntrinsic(intrinsicID); + emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op1Reg, ival); + } + else + { + genHWIntrinsic_R_R_RM_I(node, ins); + } + break; + } + + case NI_SSE41_TestAllOnes: + { + regNumber tmpReg = node->GetSingleTempReg(); + assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest); + emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg); + emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg); + emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg); + emit->emitIns_R(INS_setb, EA_1BYTE, targetReg); + break; + } + + case NI_SSE41_TestAllZeros: + case NI_SSE41_TestZ: + { + assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest); + emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg); + emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum); + emit->emitIns_R(INS_sete, EA_1BYTE, targetReg); + break; + } + + case NI_SSE41_TestC: + { + assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest); + emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg); + emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum); + emit->emitIns_R(INS_setb, EA_1BYTE, targetReg); + break; + } + + case NI_SSE41_TestMixOnesZeros: + case NI_SSE41_TestNotZAndNotC: + { + assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest); + emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg); + emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum); + emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); + break; + } + + default: + unreached(); + break; + } + + genProduceReg(node); } //------------------------------------------------------------------------ diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index 0c28863f17..be9d031136 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -261,12 +261,47 @@ HARDWARE_INTRINSIC(SSSE3_MultiplyHighRoundScale, "MultiplyHi HARDWARE_INTRINSIC(SSSE3_Shuffle, "Shuffle", SSSE3, -1, 16, 2, {INS_pshufb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSSE3_Sign, "Sign", SSSE3, -1, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +// ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************ +// Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************ // SSE41 Intrinsics -HARDWARE_INTRINSIC(SSE41_IsSupported, "get_IsSupported", SSE41, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE41_BlendVariable, "BlendVariable", SSE41, -1, 16, 3, {INS_pblendvb, INS_pblendvb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE41_CompareEqual, "CompareEqual", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE41_LoadAlignedVector128NonTemporal, "LoadAlignedVector128NonTemporal", SSE41, -1, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE41_Multiply, "Multiply", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE41_IsSupported, "get_IsSupported", SSE41, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_Blend, "Blend", SSE41, -1, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(SSE41_BlendVariable, "BlendVariable", SSE41, -1, 16, 3, {INS_pblendvb, INS_pblendvb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_Ceiling, "Ceiling", SSE41, 10, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_CeilingScalar, "CeilingScalar", SSE41, 10, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE41_CompareEqual, "CompareEqual", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int16, "ConvertToVector128Int16", SSE41, -1, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg) +HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int32, "ConvertToVector128Int32", SSE41, -1, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg) +HARDWARE_INTRINSIC(SSE41_ConvertToVector128Int64, "ConvertToVector128Int64", SSE41, -1, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromArg) +HARDWARE_INTRINSIC(SSE41_DotProduct, "DotProduct", SSE41, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(SSE41_Floor, "Floor", SSE41, 9, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_FloorScalar, "FloorScalar", SSE41, 9, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE41_LoadAlignedVector128NonTemporal, "LoadAlignedVector128NonTemporal", SSE41, -1, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_Max, "Max", SSE41, -1, 16, 2, {INS_pmaxsb, INS_invalid, INS_invalid, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_Min, "Min", SSE41, -1, 16, 2, {INS_pminsb, INS_invalid, INS_invalid, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_MinHorizontal, "MinHorizontal", SSE41, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_MultipleSumAbsoluteDifferences, "MultipleSumAbsoluteDifferences", SSE41, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(SSE41_Multiply, "Multiply", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE41_MultiplyLow, "MultiplyLow", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_PackUnsignedSaturate, "PackUnsignedSaturate", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_RoundCurrentDirection, "RoundCurrentDirection", SSE41, 4, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_RoundCurrentDirectionScalar, "RoundCurrentDirectionScalar", SSE41, 4, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE41_RoundToNearestInteger, "RoundToNearestInteger", SSE41, 8, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_RoundToNearestIntegerScalar, "RoundToNearestIntegerScalar", SSE41, 8, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE41_RoundToNegativeInfinity, "RoundToNegativeInfinity", SSE41, 9, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_RoundToNegativeInfinityScalar, "RoundToNegativeInfinityScalar", SSE41, 9, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE41_RoundToPositiveInfinity, "RoundToPositiveInfinity", SSE41, 10, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_RoundToPositiveInfinityScalar, "RoundToPositiveInfinityScalar", SSE41, 10, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE41_RoundToZero, "RoundToZero", SSE41, 11, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_RoundToZeroScalar, "RoundToZeroScalar", SSE41, 11, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_Special, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE41_TestAllOnes, "TestAllOnes", SSE41, -1, 16, 1, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE41_TestAllZeros, "TestAllZeros", SSE41, -1, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE41_TestC, "TestC", SSE41, -1, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE41_TestMixOnesZeros, "TestMixOnesZeros", SSE41, -1, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE41_TestNotZAndNotC, "TestNotZAndNotC", SSE41, -1, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE41_TestZ, "TestZ", SSE41, -1, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MultiIns|HW_Flag_NoContainment) // SSE42 Intrinsics HARDWARE_INTRINSIC(SSE42_IsSupported, "get_IsSupported", SSE42, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp index b7fbcfcf70..c4b482325c 100644 --- a/src/jit/hwintrinsicxarch.cpp +++ b/src/jit/hwintrinsicxarch.cpp @@ -1083,7 +1083,44 @@ GenTree* Compiler::impSSE41Intrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig, bool mustExpand) { - return nullptr; + GenTree* retNode = nullptr; + GenTree* op1 = nullptr; + GenTree* op2 = nullptr; + GenTree* op3 = nullptr; + GenTree* op4 = nullptr; + int simdSize = simdSizeOfHWIntrinsic(intrinsic, sig); + + assert(simdSize == 16); + + switch (intrinsic) + { + case NI_SSE41_CeilingScalar: + case NI_SSE41_FloorScalar: + case NI_SSE41_RoundCurrentDirectionScalar: + case NI_SSE41_RoundToNearestIntegerScalar: + case NI_SSE41_RoundToNegativeInfinityScalar: + case NI_SSE41_RoundToPositiveInfinityScalar: + case NI_SSE41_RoundToZeroScalar: + { + assert((sig->numArgs == 1) || (sig->numArgs == 2)); + var_types baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass); + assert((baseType == TYP_FLOAT) || (baseType == TYP_DOUBLE)); + + if (sig->numArgs == 2) + { + op2 = impSIMDPopStack(TYP_SIMD16); + } + + op1 = impSIMDPopStack(TYP_SIMD16); + retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, baseType, simdSize); + break; + } + + default: + JITDUMP("Not implemented hardware intrinsic"); + break; + } + return retNode; } GenTree* Compiler::impSSE42Intrinsic(NamedIntrinsic intrinsic, diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index 3d314a699f..ae8adffea6 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -416,16 +416,28 @@ INST3( pmaxsd, "pmaxsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( pmaxuw, "pmaxuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3E)) // packed maximum 16-bit unsigned integers INST3( pmaxud, "pmaxud" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3F)) // packed maximum 32-bit unsigned integers INST3( pmovsxbw, "pmovsxbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x20)) // Packed sign extend byte to short +INST3( pmovsxbd, "pmovsxbd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x21)) // Packed sign extend byte to int +INST3( pmovsxbq, "pmovsxbq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x22)) // Packed sign extend byte to long INST3( pmovsxwd, "pmovsxwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x23)) // Packed sign extend short to int +INST3( pmovsxwq, "pmovsxwq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x24)) // Packed sign extend short to long INST3( pmovsxdq, "pmovsxdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x25)) // Packed sign extend int to long +INST3( pmovzxbw, "pmovzxbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x30)) // Packed zero extend byte to short +INST3( pmovzxbd, "pmovzxbd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x31)) // Packed zero extend byte to intg +INST3( pmovzxbq, "pmovzxbq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x32)) // Packed zero extend byte to lon +INST3( pmovzxwd, "pmovzxwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x33)) // Packed zero extend short to int +INST3( pmovzxwq, "pmovzxwq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x34)) // Packed zero extend short to long +INST3( pmovzxdq, "pmovzxdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x35)) // Packed zero extend int to long INST3( packusdw, "packusdw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2B)) // Pack (narrow) int to unsigned short with saturation INST3( roundps, "roundps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x08)) // Round packed single precision floating-point values INST3( roundss, "roundss" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0A)) // Round scalar single precision floating-point values INST3( roundpd, "roundpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x09)) // Round packed double precision floating-point values INST3( roundsd, "roundsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0B)) // Round scalar double precision floating-point values INST3( pmuldq, "pmuldq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x28)) // packed multiply 32-bit signed integers and store 64-bit result +INST3( blendps, "blendps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0C)) // Blend Packed Single Precision Floating-Point Values INST3( blendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x14)) // Variable Blend Packed Singles +INST3( blendpd, "blendpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0D)) // Blend Packed Double Precision Floating-Point Values INST3( blendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x15)) // Variable Blend Packed Doubles +INST3( pblendw, "pblendw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0E)) // Blend Packed Words INST3( pblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x10)) // Variable Blend Packed Bytes INST3( phaddw, "phaddw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x01)) // Packed horizontal add of 16-bit integers INST3( phsubw, "phsubw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x05)) // Packed horizontal subtract of 16-bit integers @@ -434,9 +446,11 @@ INST3( phaddsw, "phaddsw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( phsubsw, "phsubsw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x07)) // Packed horizontal subtract of 16-bit integers with saturation INST3( lddqu, "lddqu" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEDBL(0xF0)) // Load Unaligned integer INST3( movntdqa, "movntdqa" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2A)) // Load Double Quadword Non-Temporal Aligned Hint -INST3( movddup, "movddup" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEDBL(0x12)) // Replicate Double FP Values +INST3( movddup, "movddup" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEDBL(0x12)) // Replicate Double FP Values INST3( movsldup, "movsldup" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEFLT(0x12)) // Replicate even-indexed Single FP Values INST3( movshdup, "movshdup" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEFLT(0x16)) // Replicate odd-indexed Single FP Values +INST3( phminposuw, "phminposuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x41)) // Packed Horizontal Word Minimum +INST3( mpsadbw, "mpsadbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x42)) // Compute Multiple Packed Sums of Absolute Difference INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index 6ff82a78ee..e5a74fdf84 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -2344,6 +2344,13 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) } break; + case NI_SSE41_TestAllOnes: + { + info->internalFloatCount = 1; + info->setInternalCandidates(this, allSIMDRegs()); + break; + } + #ifdef _TARGET_X86_ case NI_SSE42_Crc32: { |