diff options
-rw-r--r-- | src/jit/codegenlinear.h | 9 | ||||
-rw-r--r-- | src/jit/codegenxarch.cpp | 10 | ||||
-rw-r--r-- | src/jit/compiler.cpp | 3 | ||||
-rw-r--r-- | src/jit/compiler.h | 10 | ||||
-rw-r--r-- | src/jit/compiler.hpp | 3 | ||||
-rw-r--r-- | src/jit/emitxarch.cpp | 60 | ||||
-rw-r--r-- | src/jit/emitxarch.h | 1 | ||||
-rw-r--r-- | src/jit/flowgraph.cpp | 9 | ||||
-rw-r--r-- | src/jit/gentree.cpp | 49 | ||||
-rw-r--r-- | src/jit/gentree.h | 6 | ||||
-rw-r--r-- | src/jit/gtlist.h | 4 | ||||
-rw-r--r-- | src/jit/gtstructs.h | 10 | ||||
-rw-r--r-- | src/jit/hwintrinsiccodegenxarch.cpp | 277 | ||||
-rw-r--r-- | src/jit/hwintrinsiclistxarch.h | 18 | ||||
-rw-r--r-- | src/jit/hwintrinsicxarch.cpp | 222 | ||||
-rw-r--r-- | src/jit/instrsxarch.h | 28 | ||||
-rw-r--r-- | src/jit/liveness.cpp | 3 | ||||
-rw-r--r-- | src/jit/lower.cpp | 3 | ||||
-rw-r--r-- | src/jit/lowerxarch.cpp | 25 | ||||
-rw-r--r-- | src/jit/lsraxarch.cpp | 42 | ||||
-rw-r--r-- | src/jit/morph.cpp | 3 | ||||
-rw-r--r-- | src/jit/namedintrinsiclist.h | 12 | ||||
-rw-r--r-- | src/jit/rangecheck.cpp | 6 | ||||
-rw-r--r-- | src/jit/stacklevelsetter.cpp | 3 | ||||
-rw-r--r-- | src/jit/valuenum.cpp | 3 |
25 files changed, 606 insertions, 213 deletions
diff --git a/src/jit/codegenlinear.h b/src/jit/codegenlinear.h index 7727d6da69..8818b9e6c8 100644 --- a/src/jit/codegenlinear.h +++ b/src/jit/codegenlinear.h @@ -117,13 +117,10 @@ void genPutArgStkSIMD12(GenTree* treeNode); #ifdef FEATURE_HW_INTRINSICS void genHWIntrinsic(GenTreeHWIntrinsic* node); #if defined(_TARGET_XARCH_) -void genHWIntrinsic_FullRangeImm8(GenTreeHWIntrinsic* node, instruction ins); void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins); void genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins); void genSSEIntrinsic(GenTreeHWIntrinsic* node); void genSSE2Intrinsic(GenTreeHWIntrinsic* node); -void genSSE3Intrinsic(GenTreeHWIntrinsic* node); -void genSSSE3Intrinsic(GenTreeHWIntrinsic* node); void genSSE41Intrinsic(GenTreeHWIntrinsic* node); void genSSE42Intrinsic(GenTreeHWIntrinsic* node); void genAVXIntrinsic(GenTreeHWIntrinsic* node); @@ -135,6 +132,12 @@ void genFMAIntrinsic(GenTreeHWIntrinsic* node); void genLZCNTIntrinsic(GenTreeHWIntrinsic* node); void genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node); void genPOPCNTIntrinsic(GenTreeHWIntrinsic* node); +template <typename HWIntrinsicSwitchCaseBody> +void genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic, + regNumber nonConstImmReg, + regNumber baseReg, + regNumber offsReg, + HWIntrinsicSwitchCaseBody emitSwCase); #endif // defined(_TARGET_XARCH_) #if defined(_TARGET_ARM64_) instruction getOpForHWIntrinsic(GenTreeHWIntrinsic* node, var_types instrType); diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 7e0f688033..b5cb2c81ea 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -1967,6 +1967,9 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS genRangeCheck(treeNode); break; @@ -3756,12 +3759,7 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* tree) // generate 
code for BoundsCheck nodes void CodeGen::genRangeCheck(GenTree* oper) { -#ifdef FEATURE_SIMD - noway_assert(oper->OperGet() == GT_ARR_BOUNDS_CHECK || oper->OperGet() == GT_SIMD_CHK); -#else // !FEATURE_SIMD - noway_assert(oper->OperGet() == GT_ARR_BOUNDS_CHECK); -#endif // !FEATURE_SIMD - + noway_assert(oper->OperIsBoundsCheck()); GenTreeBoundsChk* bndsChk = oper->AsBoundsChk(); GenTree* arrIndex = bndsChk->gtIndex; diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp index d24072835c..bbf8a9af53 100644 --- a/src/jit/compiler.cpp +++ b/src/jit/compiler.cpp @@ -7271,6 +7271,9 @@ void Compiler::CopyTestDataToCloneTree(GenTree* from, GenTree* to) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS CopyTestDataToCloneTree(from->gtBoundsChk.gtIndex, to->gtBoundsChk.gtIndex); CopyTestDataToCloneTree(from->gtBoundsChk.gtArrLen, to->gtBoundsChk.gtArrLen); return; diff --git a/src/jit/compiler.h b/src/jit/compiler.h index 72dafe3569..9f05442db0 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -3114,12 +3114,17 @@ protected: bool compSupportsHWIntrinsic(InstructionSet isa); bool isScalarISA(InstructionSet isa); static int ivalOfHWIntrinsic(NamedIntrinsic intrinsic); + unsigned simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig); static int numArgsOfHWIntrinsic(NamedIntrinsic intrinsic); + static GenTree* lastOpOfHWIntrinsic(GenTreeHWIntrinsic* node, int numArgs); static instruction insOfHWIntrinsic(NamedIntrinsic intrinsic, var_types type); static HWIntrinsicCategory categoryOfHWIntrinsic(NamedIntrinsic intrinsic); static HWIntrinsicFlag flagsOfHWIntrinsic(NamedIntrinsic intrinsic); GenTree* getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass); - GenTreeArgList* buildArgList(CORINFO_SIG_INFO* sig); + static int immUpperBoundOfHWIntrinsic(NamedIntrinsic intrinsic); + GenTree* impNonConstFallback(NamedIntrinsic intrinsic, 
var_types simdType, var_types baseType); + static bool isImmHWIntrinsic(NamedIntrinsic intrinsic, GenTree* lastOp); + GenTree* addRangeCheckIfNeeded(NamedIntrinsic intrinsic, GenTree* lastOp, bool mustExpand); #endif // _TARGET_XARCH_ #ifdef _TARGET_ARM64_ InstructionSet lookupHWIntrinsicISA(const char* className); @@ -10039,6 +10044,9 @@ public: #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS { GenTreeBoundsChk* const boundsChk = node->AsBoundsChk(); diff --git a/src/jit/compiler.hpp b/src/jit/compiler.hpp index 888ef7eaf9..0a1204eed7 100644 --- a/src/jit/compiler.hpp +++ b/src/jit/compiler.hpp @@ -4929,6 +4929,9 @@ void GenTree::VisitOperands(TVisitor visitor) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS { GenTreeBoundsChk* const boundsChk = this->AsBoundsChk(); if (visitor(boundsChk->gtIndex) == VisitResult::Abort) diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index 3a2f92b557..7e47ccfb7d 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -187,6 +187,14 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins) case INS_psubusb: case INS_psubusw: case INS_psubw: + case INS_pslld: + case INS_psllq: + case INS_psllw: + case INS_psrld: + case INS_psrlq: + case INS_psrlw: + case INS_psrad: + case INS_psraw: case INS_punpckhbw: case INS_punpckhdq: case INS_punpckhqdq: @@ -209,6 +217,11 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins) case INS_vinsertf128: case INS_vinserti128: case INS_vperm2i128: + case INS_vpsrlvd: + case INS_vpsrlvq: + case INS_vpsravd: + case INS_vpsllvd: + case INS_vpsllvq: case INS_xorpd: case INS_xorps: return IsAVXInstruction(ins); @@ -366,7 +379,7 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr) // size specification (128 vs. 256 bits) and the operand size specification (32 vs. 
64 bits), where both are // required, the instruction must be created with the register size attribute (EA_16BYTE or EA_32BYTE), // and here we must special case these by the opcode. - if (ins == INS_vpermq) + if (ins == INS_vpermq || ins == INS_vpsrlvq || ins == INS_vpsllvq) { return true; } @@ -5479,6 +5492,50 @@ void emitter::emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg, } } +static bool isSseShift(instruction ins) +{ + switch (ins) + { + case INS_psrldq: + case INS_pslldq: + case INS_psrld: + case INS_psrlw: + case INS_psrlq: + case INS_pslld: + case INS_psllw: + case INS_psllq: + case INS_psrad: + case INS_psraw: + return true; + default: + return false; + } +} + +void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int ival) +{ + // TODO-XARCH refactoring emitIns_R_R_I to handle SSE2/AVX2 shift as well as emitIns_R_I + bool isShift = isSseShift(ins); + if (UseVEXEncoding() && !isShift) + { + emitIns_R_R_I(ins, attr, reg, reg1, ival); + } + else + { + if (reg1 != reg) + { + emitIns_R_R(INS_movaps, attr, reg, reg1); + } + // TODO-XARCH-BUG emitOutputRI cannot work with SSE2 shift instruction on imm8 > 127, so we replace it by the + // semantic alternatives. 
https://github.com/dotnet/coreclr/issues/16543 + if (isShift && ival > 127) + { + ival = 127; + } + emitIns_R_I(ins, attr, reg, ival); + } +} + void emitter::emitIns_SIMD_R_R_R_R( instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3) { @@ -10746,6 +10803,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) regOpcode = (regNumber)6; break; case INS_psrad: + case INS_psraw: regOpcode = (regNumber)4; break; default: diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h index 6acd835da4..a5bc303719 100644 --- a/src/jit/emitxarch.h +++ b/src/jit/emitxarch.h @@ -496,6 +496,7 @@ void emitIns_SIMD_R_R_C( instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs); void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs); void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2); +void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int ival); void emitIns_SIMD_R_R_R_R( instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3); #endif // FEATURE_HW_INTRINSICS diff --git a/src/jit/flowgraph.cpp b/src/jit/flowgraph.cpp index f3a0bf2ced..ccf70dcddf 100644 --- a/src/jit/flowgraph.cpp +++ b/src/jit/flowgraph.cpp @@ -9583,6 +9583,9 @@ void Compiler::fgSimpleLowering() #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS { // Add in a call to an error routine. 
fgSetRngChkTarget(tree, false); @@ -18796,6 +18799,9 @@ void Compiler::fgSetTreeSeqHelper(GenTree* tree, bool isLIR) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS // Evaluate the trees left to right fgSetTreeSeqHelper(tree->gtBoundsChk.gtIndex, isLIR); fgSetTreeSeqHelper(tree->gtBoundsChk.gtArrLen, isLIR); @@ -21426,6 +21432,9 @@ void Compiler::fgDebugCheckFlags(GenTree* tree) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS GenTreeBoundsChk* bndsChk; bndsChk = tree->AsBoundsChk(); diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp index 284b420120..9b8f88bd85 100644 --- a/src/jit/gentree.cpp +++ b/src/jit/gentree.cpp @@ -306,6 +306,10 @@ void GenTree::InitNodeSize() #ifdef FEATURE_SIMD GenTree::s_gtNodeSizes[GT_SIMD_CHK] = TREE_NODE_SZ_LARGE; #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + GenTree::s_gtNodeSizes[GT_HW_INTRINSIC_CHK] = TREE_NODE_SZ_LARGE; +#endif // FEATURE_HW_INTRINSICS + GenTree::s_gtNodeSizes[GT_ARR_ELEM] = TREE_NODE_SZ_LARGE; GenTree::s_gtNodeSizes[GT_ARR_INDEX] = TREE_NODE_SZ_LARGE; GenTree::s_gtNodeSizes[GT_ARR_OFFSET] = TREE_NODE_SZ_LARGE; @@ -1592,6 +1596,9 @@ AGAIN: #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS return Compare(op1->gtBoundsChk.gtIndex, op2->gtBoundsChk.gtIndex) && Compare(op1->gtBoundsChk.gtArrLen, op2->gtBoundsChk.gtArrLen) && (op1->gtBoundsChk.gtThrowKind == op2->gtBoundsChk.gtThrowKind); @@ -1818,6 +1825,9 @@ AGAIN: #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS if (gtHasRef(tree->gtBoundsChk.gtIndex, lclNum, defOnly)) { return true; @@ -2236,6 +2246,9 @@ AGAIN: #ifdef FEATURE_SIMD case 
GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS hash = genTreeHashAdd(hash, gtHashValue(tree->gtBoundsChk.gtIndex)); hash = genTreeHashAdd(hash, gtHashValue(tree->gtBoundsChk.gtArrLen)); hash = genTreeHashAdd(hash, tree->gtBoundsChk.gtThrowKind); @@ -2501,6 +2514,9 @@ AGAIN: #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS { if (!lvaLclVarRefsAccum(tree->gtBoundsChk.gtIndex, findPtr, refsPtr, &allVars, &trkdVars)) { @@ -4911,7 +4927,11 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) case GT_ARR_BOUNDS_CHECK: #ifdef FEATURE_SIMD case GT_SIMD_CHK: -#endif // FEATURE_SIMD +#endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS + costEx = 4; // cmp reg,reg and jae throw (not taken) costSz = 7; // jump to cold section @@ -5444,6 +5464,9 @@ GenTree** GenTree::gtGetChildPointer(GenTree* parent) const #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS if (this == parent->gtBoundsChk.gtIndex) { return &(parent->gtBoundsChk.gtIndex); @@ -5709,6 +5732,9 @@ bool GenTree::TryGetUse(GenTree* def, GenTree*** use) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS { GenTreeBoundsChk* const boundsChk = this->AsBoundsChk(); if (def == boundsChk->gtIndex) @@ -6048,6 +6074,9 @@ bool GenTree::OperMayThrow(Compiler* comp) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS case GT_INDEX_ADDR: return true; default: @@ -8192,6 +8221,9 @@ GenTree* Compiler::gtCloneExpr( #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef 
FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS copy = new (this, oper) GenTreeBoundsChk(oper, tree->TypeGet(), gtCloneExpr(tree->gtBoundsChk.gtIndex, addFlags, deepVarNum, deepVarVal), @@ -8877,6 +8909,9 @@ unsigned GenTree::NumChildren() #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS return 2; case GT_FIELD: @@ -9007,6 +9042,9 @@ GenTree* GenTree::GetChild(unsigned childNum) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS switch (childNum) { case 0: @@ -9306,6 +9344,9 @@ GenTreeUseEdgeIterator::GenTreeUseEdgeIterator(GenTree* node) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS m_edge = &m_node->AsBoundsChk()->gtIndex; assert(*m_edge != nullptr); m_advance = &GenTreeUseEdgeIterator::AdvanceBoundsChk; @@ -11855,6 +11896,9 @@ void Compiler::gtDispTree(GenTree* tree, #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS gtDispVN(tree); printf("\n"); if (!topOnly) @@ -15804,6 +15848,9 @@ void Compiler::gtExtractSideEffList(GenTree* expr, #ifdef FEATURE_SIMD || expr->OperGet() == GT_SIMD_CHK #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + || expr->OperGet() == GT_HW_INTRINSIC_CHK +#endif // FEATURE_HW_INTRINSICS ) { gtExtractSideEffList(expr->AsBoundsChk()->gtIndex, pList, flags); diff --git a/src/jit/gentree.h b/src/jit/gentree.h index e246c46bc3..25d5a286fd 100644 --- a/src/jit/gentree.h +++ b/src/jit/gentree.h @@ -1661,6 +1661,12 @@ public: return true; } #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + if (op == GT_HW_INTRINSIC_CHK) + { + return true; + } +#endif // FEATURE_HW_INTRINSICS return false; } diff 
--git a/src/jit/gtlist.h b/src/jit/gtlist.h index fde7aa370e..13b8fb6e71 100644 --- a/src/jit/gtlist.h +++ b/src/jit/gtlist.h @@ -97,6 +97,10 @@ GTNODE(SIMD_CHK , GenTreeBoundsChk ,0,GTK_SPECIAL|GTK_NOVALUE)// Compa // does the compare, so that it can be more easily optimized. But that involves generating qmarks at import time... #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS +GTNODE(HW_INTRINSIC_CHK , GenTreeBoundsChk ,0,GTK_SPECIAL|GTK_NOVALUE)// Compare whether an imm8 argument is in the valid range, and throw ArgumentOutOfRangeException if not. +#endif + GTNODE(ALLOCOBJ , GenTreeAllocObj ,0,GTK_UNOP|GTK_EXOP) // object allocator GTNODE(INIT_VAL , GenTreeOp ,0,GTK_UNOP) // Initialization value for an initBlk diff --git a/src/jit/gtstructs.h b/src/jit/gtstructs.h index 6cfb92c98e..0df4e35078 100644 --- a/src/jit/gtstructs.h +++ b/src/jit/gtstructs.h @@ -85,11 +85,15 @@ GTSTRUCT_1(FptrVal , GT_FTN_ADDR) GTSTRUCT_1(Intrinsic , GT_INTRINSIC) GTSTRUCT_1(Index , GT_INDEX) GTSTRUCT_1(IndexAddr , GT_INDEX_ADDR) -#ifdef FEATURE_SIMD +#if defined(FEATURE_HW_INTRINSICS) && defined(FEATURE_SIMD) +GTSTRUCT_3(BoundsChk , GT_ARR_BOUNDS_CHECK, GT_SIMD_CHK, GT_HW_INTRINSIC_CHK) +#elif defined(FEATURE_SIMD) GTSTRUCT_2(BoundsChk , GT_ARR_BOUNDS_CHECK, GT_SIMD_CHK) -#else // !FEATURE_SIMD +#elif defined(FEATURE_HW_INTRINSICS) +GTSTRUCT_2(BoundsChk , GT_ARR_BOUNDS_CHECK, GT_HW_INTRINSIC_CHK) +#else // !FEATURE_SIMD && !FEATURE_HW_INTRINSICS GTSTRUCT_1(BoundsChk , GT_ARR_BOUNDS_CHECK) -#endif // !FEATURE_SIMD +#endif // !FEATURE_SIMD && !FEATURE_HW_INTRINSICS GTSTRUCT_1(ArrLen , GT_ARR_LENGTH) GTSTRUCT_1(ArrElem , GT_ARR_ELEM) GTSTRUCT_1(ArrOffs , GT_ARR_OFFSET) diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp index a4fc9f1875..dbc2e35fcb 100644 --- a/src/jit/hwintrinsiccodegenxarch.cpp +++ b/src/jit/hwintrinsiccodegenxarch.cpp @@ -37,8 +37,8 @@ static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsi { // TODO - 
make more categories to the table-driven framework // HW_Category_Helper and HW_Flag_MultiIns usually need manual codegen - const bool tableDrivenCategory = category == HW_Category_SimpleSIMD || category == HW_Category_MemoryLoad || - category == HW_Category_MemoryStore || category == HW_Category_SIMDScalar; + const bool tableDrivenCategory = + category != HW_Category_Special && category != HW_Category_Scalar && category != HW_Category_Helper; const bool tableDrivenFlag = (flags & HW_Flag_MultiIns) == 0; return tableDrivenCategory && tableDrivenFlag; } @@ -75,7 +75,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) assert(numArgs >= 0); instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType); assert(ins != INS_invalid); - emitAttr simdSize = (emitAttr)(node->gtSIMDSize); + emitAttr simdSize = EA_ATTR(node->gtSIMDSize); assert(simdSize != 0); switch (numArgs) @@ -99,9 +99,11 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case 2: genConsumeOperands(node); + op1Reg = op1->gtRegNum; + op2Reg = op2->gtRegNum; if (category == HW_Category_MemoryStore) { - emit->emitIns_AR_R(ins, simdSize, op2->gtRegNum, op1->gtRegNum, 0); + emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0); } else if ((ival != -1) && varTypeIsFloating(baseType)) { @@ -109,7 +111,28 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) } else if (category == HW_Category_MemoryLoad) { - emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1->gtRegNum, op2->gtRegNum); + emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg); + } + else if (Compiler::isImmHWIntrinsic(intrinsicID, op2)) + { + auto emitSwCase = [&](unsigned i) { + emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, (int)i); + }; + + if (op2->IsCnsIntOrI()) + { + ssize_t ival = op2->AsIntCon()->IconValue(); + emitSwCase((unsigned)ival); + } + else + { + // We emit a fallback case for the scenario when the imm-op is not a constant. 
This should + // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it + // can also occur if the consumer calls it directly and just doesn't pass a constant value. + regNumber baseReg = node->ExtractTempReg(); + regNumber offsReg = node->GetSingleTempReg(); + genHWIntrinsicJumpTableFallback(intrinsicID, op2Reg, baseReg, offsReg, emitSwCase); + } } else { @@ -137,7 +160,30 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) genConsumeRegs(op3); regNumber op3Reg = op3->gtRegNum; - emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg); + if (Compiler::isImmHWIntrinsic(intrinsicID, op3)) + { + auto emitSwCase = [&](unsigned i) { + emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, (int)i); + }; + if (op3->IsCnsIntOrI()) + { + ssize_t ival = op3->AsIntCon()->IconValue(); + emitSwCase((unsigned)ival); + } + else + { + // We emit a fallback case for the scenario when the imm-op is not a constant. This should + // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it + // can also occur if the consumer calls it directly and just doesn't pass a constant value. 
+ regNumber baseReg = node->ExtractTempReg(); + regNumber offsReg = node->GetSingleTempReg(); + genHWIntrinsicJumpTableFallback(intrinsicID, op3Reg, baseReg, offsReg, emitSwCase); + } + } + else + { + emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg); + } break; } @@ -157,12 +203,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case InstructionSet_SSE2: genSSE2Intrinsic(node); break; - case InstructionSet_SSE3: - genSSE3Intrinsic(node); - break; - case InstructionSet_SSSE3: - genSSSE3Intrinsic(node); - break; case InstructionSet_SSE41: genSSE41Intrinsic(node); break; @@ -202,100 +242,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) } } -void CodeGen::genHWIntrinsic_FullRangeImm8(GenTreeHWIntrinsic* node, instruction ins) -{ - var_types targetType = node->TypeGet(); - regNumber targetReg = node->gtRegNum; - GenTree* op1 = node->gtGetOp1(); - regNumber op1Reg = REG_NA; - GenTree* op2 = node->gtGetOp2(); - regNumber op2Reg = REG_NA; - GenTree* op3 = nullptr; - emitAttr simdSize = (emitAttr)(node->gtSIMDSize); - emitter* emit = getEmitter(); - - GenTreeArgList* argList; - - assert(op1->OperIsList()); - assert(op1->AsArgList()->Rest() != nullptr); - assert(op1->AsArgList()->Rest()->Rest() != nullptr); - assert(op1->AsArgList()->Rest()->Rest()->Rest() == nullptr); - assert(op2 == nullptr); - - argList = op1->AsArgList(); - op1 = argList->Current(); - op1Reg = op1->gtRegNum; - genConsumeRegs(op1); - - argList = argList->Rest(); - op2 = argList->Current(); - op2Reg = op2->gtRegNum; - genConsumeRegs(op2); - - argList = argList->Rest(); - op3 = argList->Current(); - genConsumeRegs(op3); - - if (op3->IsCnsIntOrI()) - { - ssize_t ival = op3->AsIntConCommon()->IconValue(); - emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, (int)ival); - } - else - { - // We emit a fallback case for the scenario when op3 is not a constant. 
This should normally - // only happen when the intrinsic is called indirectly, such as via Reflection. However, it can - // also occur if the consumer calls it directly and just doesn't pass a constant value. - - const unsigned jmpCount = 256; - BasicBlock* jmpTable[jmpCount]; - - unsigned jmpTableBase = emit->emitBBTableDataGenBeg(jmpCount, true); - unsigned jmpTableOffs = 0; - - // Emit the jump table - - JITDUMP("\n J_M%03u_DS%02u LABEL DWORD\n", Compiler::s_compMethodsCount, jmpTableBase); - - for (unsigned i = 0; i < jmpCount; i++) - { - jmpTable[i] = genCreateTempLabel(); - JITDUMP(" DD L_M%03u_BB%02u\n", Compiler::s_compMethodsCount, jmpTable[i]->bbNum); - emit->emitDataGenData(i, jmpTable[i]); - } - - emit->emitDataGenEnd(); - - // Compute and jump to the appropriate offset in the switch table - - regNumber baseReg = node->ExtractTempReg(); // the start of the switch table - regNumber offsReg = node->GetSingleTempReg(); // the offset into the switch table - - emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0); - - emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, op3->gtRegNum, 4, 0); - emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg); - emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg); - emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg); - - // Emit the switch table entries - - BasicBlock* switchTableBeg = genCreateTempLabel(); - BasicBlock* switchTableEnd = genCreateTempLabel(); - - genDefineTempLabel(switchTableBeg); - - for (unsigned i = 0; i < jmpCount; i++) - { - genDefineTempLabel(jmpTable[i]); - emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, i); - emit->emitIns_J(INS_jmp, switchTableEnd); - } - - genDefineTempLabel(switchTableEnd); - } -} - //------------------------------------------------------------------------ // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a 
register operand, a // register/memory operand, and that returns a value in register @@ -310,7 +256,7 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins) regNumber targetReg = node->gtRegNum; GenTree* op1 = node->gtGetOp1(); GenTree* op2 = node->gtGetOp2(); - emitAttr simdSize = (emitAttr)(node->gtSIMDSize); + emitAttr simdSize = EA_ATTR(node->gtSIMDSize); emitter* emit = getEmitter(); // TODO-XArch-CQ: Commutative operations can have op1 be contained @@ -433,7 +379,7 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins) regNumber targetReg = node->gtRegNum; GenTree* op1 = node->gtGetOp1(); GenTree* op2 = node->gtGetOp2(); - emitAttr simdSize = (emitAttr)(node->gtSIMDSize); + emitAttr simdSize = EA_ATTR(node->gtSIMDSize); int ival = Compiler::ivalOfHWIntrinsic(node->gtHWIntrinsicId); emitter* emit = getEmitter(); @@ -544,6 +490,74 @@ } } +// genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics +// with non-constant argument +// +// Arguments: +// intrinsic - intrinsic ID +// nonConstImmReg - the register contains non-constant imm8 argument +// baseReg - a register for the start of the switch table +// offsReg - a register for the offset into the switch table +// emitSwCase - the lambda to generate switch-case +// +// Return Value: +// generate the jump-table fallback for imm-intrinsics with non-constant argument. +// Note: +// This function can be used for all imm-intrinsics (whether full-range or not), +// The compiler front-end (i.e. importer) is responsible to insert a range-check IR +// (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
+// +template <typename HWIntrinsicSwitchCaseBody> +void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic, + regNumber nonConstImmReg, + regNumber baseReg, + regNumber offsReg, + HWIntrinsicSwitchCaseBody emitSwCase) +{ + assert(nonConstImmReg != REG_NA); + emitter* emit = getEmitter(); + + const unsigned maxByte = (unsigned)Compiler::immUpperBoundOfHWIntrinsic(intrinsic) + 1; + assert(maxByte <= 256); + BasicBlock* jmpTable[256]; + + unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true); + unsigned jmpTableOffs = 0; + + // Emit the jump table + for (unsigned i = 0; i < maxByte; i++) + { + jmpTable[i] = genCreateTempLabel(); + emit->emitDataGenData(i, jmpTable[i]); + } + + emit->emitDataGenEnd(); + + // Compute and jump to the appropriate offset in the switch table + emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0); + + emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0); + emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg); + emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg); + emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg); + + // Emit the switch table entries + + BasicBlock* switchTableBeg = genCreateTempLabel(); + BasicBlock* switchTableEnd = genCreateTempLabel(); + + genDefineTempLabel(switchTableBeg); + + for (unsigned i = 0; i < maxByte; i++) + { + genDefineTempLabel(jmpTable[i]); + emitSwCase(i); + emit->emitIns_J(INS_jmp, switchTableEnd); + } + + genDefineTempLabel(switchTableEnd); +} + //------------------------------------------------------------------------ // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node // @@ -752,10 +766,6 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE_Shuffle: - genHWIntrinsic_FullRangeImm8(node, INS_shufps); - break; - case NI_SSE_StoreFence: { assert(baseType == TYP_VOID); @@ -1009,25 +1019,6 @@ void 
CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node) genProduceReg(node); } -void CodeGen::genSSE3Intrinsic(GenTreeHWIntrinsic* node) -{ - NYI("Implement SSE3 intrinsic code generation"); -} - -void CodeGen::genSSSE3Intrinsic(GenTreeHWIntrinsic* node) -{ - if (node->gtHWIntrinsicId == NI_SSSE3_AlignRight) - { - genHWIntrinsic_FullRangeImm8(node, INS_palignr); - } - else - { - unreached(); - } - - genProduceReg(node); -} - //------------------------------------------------------------------------ // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node // @@ -1097,12 +1088,34 @@ void CodeGen::genAVXIntrinsic(GenTreeHWIntrinsic* node) { NamedIntrinsic intrinsicID = node->gtHWIntrinsicId; var_types baseType = node->gtSIMDBaseType; - instruction ins = INS_invalid; + emitAttr attr = EA_ATTR(node->gtSIMDSize); + var_types targetType = node->TypeGet(); + instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType); + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + regNumber targetReg = node->gtRegNum; + emitter* emit = getEmitter(); genConsumeOperands(node); switch (intrinsicID) { + case NI_AVX_SetZeroVector256: + { + assert(op1 == nullptr); + assert(op2 == nullptr); + // SetZeroVector256 will generate pxor with integral base-type, but pxor is an AVX2 instruction, so we + generate xorps on AVX machines.
+ if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsIntegral(baseType)) + { + emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); + } + else + { + emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg); + } + break; + } default: unreached(); break; diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index b5e30ea4ee..0c28863f17 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -10,7 +10,7 @@ // clang-format off -#if FEATURE_HW_INTRINSICS +#ifdef FEATURE_HW_INTRINSICS /* Note 1) Each hardware intrinsic has a unique Intrinsic ID with type of `enum NamedIntrinsic` 2) All the overloads of an intrinsic in an ISA class share one Intrinsic ID @@ -209,6 +209,11 @@ HARDWARE_INTRINSIC(SSE2_PackSignedSaturate, "PackSigned HARDWARE_INTRINSIC(SSE2_PackUnsignedSaturate, "PackUnsignedSaturate", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_packuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromArg) HARDWARE_INTRINSIC(SSE2_SetZeroVector128, "SetZeroVector128", SSE2, -1, 16, 0, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_Helper, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2_SumAbsoluteDifferences, "SumAbsoluteDifferences", SSE2, -1, 16, 2, {INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromArg) +HARDWARE_INTRINSIC(SSE2_ShiftLeftLogical, "ShiftLeftLogical", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_psllw, INS_psllw, INS_pslld, INS_pslld, INS_psllq, INS_psllq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(SSE2_ShiftLeftLogical128BitLane, "ShiftLeftLogical128BitLane", SSE2, -1, 16, 2, {INS_pslldq, INS_pslldq, 
INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(SSE2_ShiftRightArithmetic, "ShiftRightArithmetic", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_psraw, INS_invalid, INS_psrad, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(SSE2_ShiftRightLogical, "ShiftRightLogical", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_psrlw, INS_psrlw, INS_psrld, INS_psrld, INS_psrlq, INS_psrlq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(SSE2_ShiftRightLogical128BitLane, "ShiftRightLogical128BitLane", SSE2, -1, 16, 2, {INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(SSE2_Sqrt, "Sqrt", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2_SqrtScalar, "SqrtScalar", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE2_Store, "Store", SSE2, -1, 16, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoFlag) @@ -279,6 +284,8 @@ HARDWARE_INTRINSIC(AVX_And, "And", HARDWARE_INTRINSIC(AVX_AndNot, "AndNot", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_AddSubtract, 
"AddSubtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_BlendVariable, "BlendVariable", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_Compare, "Compare", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_CompareScalar, "CompareScalar", AVX, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX_ConvertToSingle, "ConvertToSingle", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_Divide, "Divide", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_divpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_DuplicateEvenIndexed, "DuplicateEvenIndexed", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -294,6 +301,7 @@ HARDWARE_INTRINSIC(AVX_Multiply, "Multiply", HARDWARE_INTRINSIC(AVX_Or, "Or", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX_Reciprocal, "Reciprocal", AVX, -1, 32, 1, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_ReciprocalSqrt, "ReciprocalSqrt", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_SetZeroVector256, "SetZeroVector256", AVX, -1, 32, 0, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_xorps, INS_xorpd}, HW_Category_Helper, HW_Flag_OneTypeGeneric) HARDWARE_INTRINSIC(AVX_Sqrt, "Sqrt", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_Store, "Store", AVX, -1, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX_StoreAligned, "StoreAligned", AVX, -1, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoFlag) @@ -321,11 +329,19 @@ HARDWARE_INTRINSIC(AVX2_HorizontalSubtractSaturate, "Horizontal HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal, "LoadAlignedVector256NonTemporal", AVX2, -1, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2_Multiply, "Multiply", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2_Or, "Or", AVX2, -1, 32, 2, {INS_por, INS_por, 
INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX2_ShiftLeftLogical, "ShiftLeftLogical", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_psllw, INS_psllw, INS_pslld, INS_pslld, INS_psllq, INS_psllq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX2_ShiftLeftLogical128BitLane, "ShiftLeftLogical128BitLane", AVX2, -1, 32, 2, {INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX2_ShiftLeftLogicalVariable, "ShiftLeftLogicalVariable", AVX2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpsllvd, INS_vpsllvd, INS_vpsllvq, INS_vpsllvq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_UnfixedSIMDSize|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX2_ShiftRightArithmetic, "ShiftRightArithmetic", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_psraw, INS_invalid, INS_psrad, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX2_ShiftRightLogical, "ShiftRightLogical", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_psrlw, INS_psrlw, INS_psrld, INS_psrld, INS_psrlq, INS_psrlq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX2_ShiftRightLogical128BitLane, "ShiftRightLogical128BitLane", AVX2, -1, 32, 2, {INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX2_ShiftRightLogicalVariable, "ShiftRightLogicalVariable", AVX2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpsrlvd, INS_vpsrlvd, INS_vpsrlvq, 
INS_vpsrlvq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_UnfixedSIMDSize|HW_Flag_NoContainment) HARDWARE_INTRINSIC(AVX2_Subtract, "Subtract", AVX2, -1, 32, 2, {INS_psubb, INS_psubb, INS_psubw, INS_psubw, INS_psubd, INS_psubd, INS_psubq, INS_psubq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2_SubtractSaturate, "SubtractSaturate", AVX2, -1, 32, 2, {INS_psubsb, INS_psubusb, INS_psubsw, INS_psubusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2_UnpackHigh, "UnpackHigh", AVX2, -1, 32, 2, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq,INS_punpckhqdq,INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2_UnpackLow, "UnpackLow", AVX2, -1, 32, 2, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq,INS_punpcklqdq,INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2_Xor, "Xor", AVX2, -1, 32, 2, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) + // AES Intrinsics HARDWARE_INTRINSIC(AES_IsSupported, "get_IsSupported", AES, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp index 3b6ef766b4..4ab3bf8285 100644 --- a/src/jit/hwintrinsicxarch.cpp +++ b/src/jit/hwintrinsicxarch.cpp @@ -4,7 +4,7 @@ #include "jitpch.h" -#if FEATURE_HW_INTRINSICS +#ifdef FEATURE_HW_INTRINSICS struct HWIntrinsicInfo { @@ -185,15 +185,21 @@ int Compiler::ivalOfHWIntrinsic(NamedIntrinsic intrinsic) // Return Value: // the SIMD size of this intrinsic // - 
from the hwIntrinsicInfoArray table if intrinsic has NO HW_Flag_UnfixedSIMDSize -// - TODO-XArch-NYI - from the signature if intrinsic has HW_Flag_UnfixedSIMDSize +// - from the signature if intrinsic has HW_Flag_UnfixedSIMDSize // // Note - this function is only used by the importer // after importation (i.e., codegen), we can get the SIMD size from GenTreeHWIntrinsic IR -static unsigned simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig) +unsigned Compiler::simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig) { assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); - assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flags & HW_Flag_UnfixedSIMDSize) == 0); - return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize; + if ((Compiler::flagsOfHWIntrinsic(intrinsic) & HW_Flag_UnfixedSIMDSize) == 0) + { + return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize; + } + + int simdSize = getSIMDTypeSizeInBytes(sig->retTypeSigClass); + assert(simdSize > 0); + return (unsigned)simdSize; } //------------------------------------------------------------------------ @@ -213,6 +219,41 @@ int Compiler::numArgsOfHWIntrinsic(NamedIntrinsic intrinsic) } //------------------------------------------------------------------------ +// lastOpOfHWIntrinsic: get the last operand of a HW intrinsic +// +// Arguments: +// node -- the intrinsic node. 
+ numArgs -- number of arguments + // + // Return Value: + // the last operand of the intrinsic node + // +GenTree* Compiler::lastOpOfHWIntrinsic(GenTreeHWIntrinsic* node, int numArgs) +{ + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + switch (numArgs) + { + case 0: + return nullptr; + case 1: + assert(op1 != nullptr); + return op1; + case 2: + assert(op2 != nullptr); + return op2; + case 3: + assert(op1->OperIsList()); + assert(op1->AsArgList()->Rest()->Rest()->Current() != nullptr); + assert(op1->AsArgList()->Rest()->Rest()->Rest() == nullptr); + return op1->AsArgList()->Rest()->Rest()->Current(); + default: + unreached(); + return nullptr; + } +} + +//------------------------------------------------------------------------ // insOfHWIntrinsic: get the instruction of the given intrinsic // // Arguments: @@ -281,9 +322,9 @@ GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE unsigned int argSizeBytes; var_types base = getBaseTypeAndSizeOfSIMDType(argClass, &argSizeBytes); argType = getSIMDTypeForSize(argSizeBytes); - assert(argType == TYP_SIMD32 || argType == TYP_SIMD16); + assert((argType == TYP_SIMD32) || (argType == TYP_SIMD16)); arg = impSIMDPopStack(argType); - assert(arg->TypeGet() == TYP_SIMD16 || arg->TypeGet() == TYP_SIMD32); + assert((arg->TypeGet() == TYP_SIMD16) || (arg->TypeGet() == TYP_SIMD32)); } else { @@ -296,6 +337,136 @@ GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE } //------------------------------------------------------------------------ +// immUpperBoundOfHWIntrinsic: get the max imm-value of non-full-range IMM intrinsic +// +// Arguments: +// intrinsic -- intrinsic ID +// +// Return Value: +// the max imm-value of non-full-range IMM intrinsic +// +int Compiler::immUpperBoundOfHWIntrinsic(NamedIntrinsic intrinsic) +{ + assert(categoryOfHWIntrinsic(intrinsic) == HW_Category_IMM); + switch (intrinsic) + { + case NI_AVX_Compare: + case NI_AVX_CompareScalar: + return 31; // enum
FloatComparisonMode has 32 values + + default: + assert((flagsOfHWIntrinsic(intrinsic) & HW_Flag_FullRangeIMM) != 0); + return 255; + } +} + +//------------------------------------------------------------------------ +// impNonConstFallback: convert certain SSE2/AVX2 shift intrinsic to its semantic alternative when the imm-arg is +// not a compile-time constant +// +// Arguments: +// intrinsic -- intrinsic ID +// simdType -- Vector type +// baseType -- base type of the Vector128/256<T> +// +// Return Value: +// return the IR of semantic alternative on non-const imm-arg +// +GenTree* Compiler::impNonConstFallback(NamedIntrinsic intrinsic, var_types simdType, var_types baseType) +{ + assert((flagsOfHWIntrinsic(intrinsic) & HW_Flag_NoJmpTableIMM) != 0); + switch (intrinsic) + { + case NI_SSE2_ShiftLeftLogical: + case NI_SSE2_ShiftRightArithmetic: + case NI_SSE2_ShiftRightLogical: + case NI_AVX2_ShiftLeftLogical: + case NI_AVX2_ShiftRightArithmetic: + case NI_AVX2_ShiftRightLogical: + { + GenTree* op2 = impPopStack().val; + GenTree* op1 = impSIMDPopStack(simdType); + GenTree* tmpOp = + gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_SSE2_ConvertScalarToVector128Int32, TYP_INT, 16); + return gtNewSimdHWIntrinsicNode(simdType, op1, tmpOp, intrinsic, baseType, genTypeSize(simdType)); + } + + default: + unreached(); + return nullptr; + } +} + +//------------------------------------------------------------------------ +// isImmHWIntrinsic: check the intrinsic is a imm-intrinsic overload or not +// +// Arguments: +// intrinsic -- intrinsic ID +// lastOp -- the last operand of the intrinsic that may point to the imm-arg +// +// Return Value: +// Return true iff the intrinsics is an imm-intrinsic overload. +// Note: that some intrinsics, with HW_Flag_MaybeIMM set, have both imm (integer immediate) and vector (i.e. +// non-TYP_INT) overloads. 
+// +bool Compiler::isImmHWIntrinsic(NamedIntrinsic intrinsic, GenTree* lastOp) +{ + if (categoryOfHWIntrinsic(intrinsic) != HW_Category_IMM) + { + return false; + } + + if ((flagsOfHWIntrinsic(intrinsic) & HW_Flag_MaybeIMM) != 0 && genActualType(lastOp->TypeGet()) != TYP_INT) + { + return false; + } + + return true; +} + +//------------------------------------------------------------------------ +// addRangeCheckIfNeeded: add a GT_HW_INTRINSIC_CHK node for non-full-range imm-intrinsic +// +// Arguments: +// intrinsic -- intrinsic ID +// lastOp -- the last operand of the intrinsic that points to the imm-arg +// mustExpand -- true if the compiler is compiling the fallback(GT_CALL) of this intrinsics +// +// Return Value: +// add a GT_HW_INTRINSIC_CHK node for non-full-range imm-intrinsic, which would throw ArgumentOutOfRangeException +// when the imm-argument is not in the valid range +// +GenTree* Compiler::addRangeCheckIfNeeded(NamedIntrinsic intrinsic, GenTree* lastOp, bool mustExpand) +{ + assert(lastOp != nullptr); + // Full-range imm-intrinsics do not need the range-check + // because the imm-parameter of the intrinsic method is a byte. 
+ if (mustExpand && ((flagsOfHWIntrinsic(intrinsic) & HW_Flag_FullRangeIMM) == 0) && + isImmHWIntrinsic(intrinsic, lastOp)) + { + assert(!lastOp->IsCnsIntOrI()); + GenTree* upperBoundNode = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, immUpperBoundOfHWIntrinsic(intrinsic)); + GenTree* index = nullptr; + if ((lastOp->gtFlags & GTF_SIDE_EFFECT) != 0) + { + index = fgInsertCommaFormTemp(&lastOp); + } + else + { + index = gtCloneExpr(lastOp); + } + GenTreeBoundsChk* hwIntrinsicChk = new (this, GT_HW_INTRINSIC_CHK) + GenTreeBoundsChk(GT_HW_INTRINSIC_CHK, TYP_VOID, index, upperBoundNode, SCK_RNGCHK_FAIL); + hwIntrinsicChk->gtThrowKind = SCK_ARG_RNG_EXCPN; + return gtNewOperNode(GT_COMMA, lastOp->TypeGet(), hwIntrinsicChk, lastOp); + } + else + { + return lastOp; + } +} + +//------------------------------------------------------------------------ // isFullyImplmentedISAClass: return true if all the hardware intrinsics // of this ISA are implemented in RyuJIT. // @@ -445,16 +616,35 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, { return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand); } - else if (category == HW_Category_IMM) + // Avoid checking stacktop for 0-op intrinsics + if (sig->numArgs > 0 && isImmHWIntrinsic(intrinsic, impStackTop().val)) { GenTree* lastOp = impStackTop().val; - if (!lastOp->IsCnsIntOrI() && !mustExpand) + // The imm-HWintrinsics that do not accept all imm8 values may throw + // ArgumentOutOfRangeException when the imm argument is not in the valid range + if ((flags & HW_Flag_FullRangeIMM) == 0) { - // When the imm-argument is not a constant and we are not being forced to expand, we need to - // return nullptr so a GT_CALL to the intrinsic method is emitted instead. The - // intrinsic method is recursive and will be forced to expand, at which point - // we emit some less efficient fallback code. 
- return nullptr; + if (!mustExpand && lastOp->IsCnsIntOrI() && + lastOp->AsIntCon()->IconValue() > immUpperBoundOfHWIntrinsic(intrinsic)) + { + return nullptr; + } + } + + if (!lastOp->IsCnsIntOrI()) + { + if ((flags & HW_Flag_NoJmpTableIMM) == 0 && !mustExpand) + { + // When the imm-argument is not a constant and we are not being forced to expand, we need to + // return nullptr so a GT_CALL to the intrinsic method is emitted instead. The + // intrinsic method is recursive and will be forced to expand, at which point + // we emit some less efficient fallback code. + return nullptr; + } + else if ((flags & HW_Flag_NoJmpTableIMM) != 0) + { + return impNonConstFallback(intrinsic, retType, baseType); + } } } @@ -537,6 +727,8 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, strip(info.compCompHnd->getArgType(sig, info.compCompHnd->getArgNext(argList), &argClass))); op2 = getArgForHWIntrinsic(argType, argClass); + op2 = addRangeCheckIfNeeded(intrinsic, op2, mustExpand); + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); op1 = getArgForHWIntrinsic(argType, argClass); @@ -551,6 +743,8 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg3, &argClass))); GenTree* op3 = getArgForHWIntrinsic(argType, argClass); + op3 = addRangeCheckIfNeeded(intrinsic, op3, mustExpand); + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass))); op2 = getArgForHWIntrinsic(argType, argClass); diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index b23b44539f..3d314a699f 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -347,14 +347,14 @@ INST3( psubusw, "psubusw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, // which is handled in emitxarch.cpp. 
INST3( psrldq, "psrldq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Shift right logical of xmm reg by given number of bytes INST3( pslldq, "pslldq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Shift left logical of xmm reg by given number of bytes -INST3( psllq, "psllq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Packed shift left logical of 64-bit integers -INST3( psrlq, "psrlq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Packed shift right logical of 64-bit integers -INST3( pslld, "pslld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift left logical of 32-bit integers -INST3( psrld, "psrld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift right logical of 32-bit integers -INST3( psllw, "psllw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift left logical of 16-bit integers -INST3( psrlw, "psrlw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift right logical of 16-bit integers -INST3( psrad, "psrad" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift right arithmetic of 32-bit integers -INST3( psraw, "psraw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift right arithmetic of 16-bit integers +INST3( psllw, "psllw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), PCKDBL(0xF1)) // Packed shift left logical of 16-bit integers +INST3( pslld, "pslld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), PCKDBL(0xF2)) // Packed shift left logical of 32-bit integers +INST3( psllq, "psllq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), PCKDBL(0xF3)) // Packed shift left logical of 64-bit integers +INST3( psrlw, "psrlw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), PCKDBL(0xD1)) // Packed shift right logical of 16-bit integers +INST3( psrld, "psrld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), PCKDBL(0xD2)) // Packed shift right logical of 32-bit integers +INST3( psrlq, "psrlq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), 
PCKDBL(0xD3)) // Packed shift right logical of 64-bit integers +INST3( psraw, "psraw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), PCKDBL(0xE1)) // Packed shift right arithmetic of 16-bit integers +INST3( psrad, "psrad" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), PCKDBL(0xE2)) // Packed shift right arithmetic of 32-bit integers INST3( pmaxub, "pmaxub" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDE)) // packed maximum unsigned bytes INST3( pminub, "pminub" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDA)) // packed minimum unsigned bytes @@ -455,9 +455,15 @@ INST3( vinserti128, "inserti128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( vzeroupper, "zeroupper" , 0, IUM_WR, 0, 0, 0xC577F8, BAD_CODE, BAD_CODE) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) INST3( vperm2i128, "perm2i128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x46)) // Permute 128-bit halves of input register INST3( vpermq, "permq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x00)) // Permute 64-bit of input register -INST3( vblendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4A)) // Variable Blend Packed Singles -INST3( vblendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4B)) // Variable Blend Packed Doubles -INST3( vpblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4C)) // Variable Blend Packed Bytes +INST3( vblendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4A)) // Variable Blend Packed Singles +INST3( vblendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4B)) // Variable Blend Packed Doubles +INST3( vpblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4C)) // Variable Blend Packed Bytes + +INST3( vpsrlvd, "psrlvd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x45)) // Variable Bit Shift Right Logical +INST3( vpsrlvq, "psrlvq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x45)) // Variable Bit Shift Right Logical +INST3( vpsravd, "psravd" , 0, IUM_WR, 0, 0, 
BAD_CODE, BAD_CODE, SSE38(0x46)) // Variable Bit Shift Right Arithmetic +INST3( vpsllvd, "psllvd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x47)) // Variable Bit Shift Left Logical +INST3( vpsllvq, "psllvq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x47)) // Variable Bit Shift Left Logical INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) // Scalar instructions in SSE4.2 diff --git a/src/jit/liveness.cpp b/src/jit/liveness.cpp index 6793bf545f..dac5a00e1b 100644 --- a/src/jit/liveness.cpp +++ b/src/jit/liveness.cpp @@ -2181,6 +2181,9 @@ void Compiler::fgComputeLifeLIR(VARSET_TP& life, BasicBlock* block, VARSET_VALAR #if defined(FEATURE_SIMD) case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS case GT_JCMP: case GT_CMP: case GT_JCC: diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index 28258b1faf..4459c45d99 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -204,6 +204,9 @@ GenTree* Lowering::LowerNode(GenTree* node) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS ContainCheckBoundsChk(node->AsBoundsChk()); break; #endif // _TARGET_XARCH_ diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index 8eb9e164cf..3f1deb825f 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -2412,27 +2412,18 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; } } - else if (numArgs == 3) + + if (Compiler::categoryOfHWIntrinsic(intrinsicID) == HW_Category_IMM) { - switch (category) + assert(numArgs >= 2); + GenTree* lastOp = Compiler::lastOpOfHWIntrinsic(node, numArgs); + assert(lastOp != nullptr); + if (Compiler::isImmHWIntrinsic(intrinsicID, lastOp)) { - case HW_Category_IMM: + if (lastOp->IsCnsIntOrI()) { - assert(op1->OperIsList()); - GenTree* op3 = 
op1->AsArgList()->Rest()->Rest()->Current(); - - if (op3->IsCnsIntOrI()) - { - MakeSrcContained(node, op3); - } - break; + MakeSrcContained(node, lastOp); } - - default: - // TODO-XArch-CQ: Assert that this is unreached after we have ensured the relevant node types are - // handled. - // https://github.com/dotnet/coreclr/issues/16497 - break; } } } diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index 365a658257..6ff82a78ee 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -449,6 +449,9 @@ void LinearScan::BuildNode(GenTree* tree) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS // Consumes arrLen & index - has no result info->srcCount = 2; assert(info->dstCount == 0); @@ -2253,6 +2256,7 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) TreeNodeInfo* info = currentNodeInfo; NamedIntrinsic intrinsicID = intrinsicTree->gtHWIntrinsicId; InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID); + int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID); if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2) { SetContainsAVXFlags(true, 32); @@ -2281,6 +2285,24 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) info->srcCount += GetOperandInfo(op2); } + if (Compiler::categoryOfHWIntrinsic(intrinsicID) == HW_Category_IMM && + (Compiler::flagsOfHWIntrinsic(intrinsicID) & HW_Flag_NoJmpTableIMM) == 0) + { + GenTree* lastOp = Compiler::lastOpOfHWIntrinsic(intrinsicTree, numArgs); + assert(lastOp != nullptr); + if (Compiler::isImmHWIntrinsic(intrinsicID, lastOp) && !lastOp->isContainedIntOrIImmed()) + { + assert(!lastOp->IsCnsIntOrI()); + + // We need two extra reg when lastOp isn't a constant so + // the offset into the jump table for the fallback path + // can be computed. 
+ + info->internalIntCount = 2; + info->setInternalCandidates(this, allRegs(TYP_INT)); + } + } + switch (intrinsicID) { case NI_SSE_CompareEqualOrderedScalar: @@ -2301,26 +2323,6 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) info->setInternalCandidates(this, allSIMDRegs()); break; - case NI_SSE_Shuffle: - case NI_SSSE3_AlignRight: - { - assert(op1->OperIsList()); - GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current(); - - if (!op3->isContainedIntOrIImmed()) - { - assert(!op3->IsCnsIntOrI()); - - // We need two extra reg when op3 isn't a constant so - // the offset into the jump table for the fallback path - // can be computed. - - info->internalIntCount = 2; - info->setInternalCandidates(this, allRegs(TYP_INT)); - } - break; - } - case NI_SSE_ConvertToSingle: case NI_SSE_StaticCast: case NI_SSE2_ConvertToDouble: diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 265e5d2237..6674a8cd1b 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -15672,6 +15672,9 @@ GenTree* Compiler::fgMorphTree(GenTree* tree, MorphAddrContext* mac) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS { fgSetRngChkTarget(tree); diff --git a/src/jit/namedintrinsiclist.h b/src/jit/namedintrinsiclist.h index 3bc85976d1..cf739328c2 100644 --- a/src/jit/namedintrinsiclist.h +++ b/src/jit/namedintrinsiclist.h @@ -31,7 +31,7 @@ enum NamedIntrinsic : unsigned int #endif }; -#if FEATURE_HW_INTRINSICS && defined(_TARGET_XARCH_) +#if defined(FEATURE_HW_INTRINSICS) && defined(_TARGET_XARCH_) enum HWIntrinsicFlag : unsigned int { HW_Flag_NoFlag = 0, @@ -79,7 +79,15 @@ enum HWIntrinsicFlag : unsigned int HW_Flag_BaseTypeFromArg = 0x400, // Indicates compFloatingPointUsed does not need to be set. 
- HW_Flag_NoFloatingPointUsed = 0x800 + HW_Flag_NoFloatingPointUsed = 0x800, + + // Maybe IMM + // the intrinsic has either imm or Vector overloads + HW_Flag_MaybeIMM = 0x1000, + + // NoJmpTable IMM + // the imm intrinsic does not need jumptable fallback when it gets non-const argument + HW_Flag_NoJmpTableIMM = 0x2000, }; inline HWIntrinsicFlag operator|(HWIntrinsicFlag c1, HWIntrinsicFlag c2) diff --git a/src/jit/rangecheck.cpp b/src/jit/rangecheck.cpp index 15c01b3a46..57870e6b56 100644 --- a/src/jit/rangecheck.cpp +++ b/src/jit/rangecheck.cpp @@ -224,7 +224,11 @@ void RangeCheck::OptimizeRangeCheck(BasicBlock* block, GenTree* stmt, GenTree* t } else #ifdef FEATURE_SIMD - if (tree->gtOper != GT_SIMD_CHK) + if (tree->gtOper != GT_SIMD_CHK +#ifdef FEATURE_HW_INTRINSICS + && tree->gtOper != GT_HW_INTRINSIC_CHK +#endif // FEATURE_HW_INTRINSICS + ) #endif // FEATURE_SIMD { arrSize = GetArrLength(arrLenVn); diff --git a/src/jit/stacklevelsetter.cpp b/src/jit/stacklevelsetter.cpp index a3d9259257..0694bfdab1 100644 --- a/src/jit/stacklevelsetter.cpp +++ b/src/jit/stacklevelsetter.cpp @@ -133,6 +133,9 @@ void StackLevelSetter::SetThrowHelperBlocks(GenTree* node, BasicBlock* block) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS { GenTreeBoundsChk* bndsChk = node->AsBoundsChk(); SetThrowHelperBlock(bndsChk->gtThrowKind, block); diff --git a/src/jit/valuenum.cpp b/src/jit/valuenum.cpp index 4948be0ddb..3723ff4a06 100644 --- a/src/jit/valuenum.cpp +++ b/src/jit/valuenum.cpp @@ -7119,6 +7119,9 @@ void Compiler::fgValueNumberTree(GenTree* tree, bool evalAsgLhsInd) #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS + case GT_HW_INTRINSIC_CHK: +#endif // FEATURE_HW_INTRINSICS { // A bounds check node has no value, but may throw exceptions. ValueNumPair excSet = vnStore->VNPExcSetSingleton( |