summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFei Peng <fei.peng@intel.com>2018-02-26 13:02:12 -0800
committerTanner Gooding <tagoo@outlook.com>2018-02-26 19:08:13 -0800
commite61f7f44f8d115aaf688c3238f0e5d44e40dfb29 (patch)
tree211179946e23da48df6ffe10ac437454700a798c
parentec78ba4a4467fff7ca069a3123788c96eeca2ebc (diff)
downloadcoreclr-e61f7f44f8d115aaf688c3238f0e5d44e40dfb29.tar.gz
coreclr-e61f7f44f8d115aaf688c3238f0e5d44e40dfb29.tar.bz2
coreclr-e61f7f44f8d115aaf688c3238f0e5d44e40dfb29.zip
Update the table-driven framework to support x86 imm-intrinsics.
And add a new range-check IR for x86 imm-intrinsics.
-rw-r--r--src/jit/codegenlinear.h9
-rw-r--r--src/jit/codegenxarch.cpp10
-rw-r--r--src/jit/compiler.cpp3
-rw-r--r--src/jit/compiler.h10
-rw-r--r--src/jit/compiler.hpp3
-rw-r--r--src/jit/emitxarch.cpp60
-rw-r--r--src/jit/emitxarch.h1
-rw-r--r--src/jit/flowgraph.cpp9
-rw-r--r--src/jit/gentree.cpp49
-rw-r--r--src/jit/gentree.h6
-rw-r--r--src/jit/gtlist.h4
-rw-r--r--src/jit/gtstructs.h10
-rw-r--r--src/jit/hwintrinsiccodegenxarch.cpp277
-rw-r--r--src/jit/hwintrinsiclistxarch.h18
-rw-r--r--src/jit/hwintrinsicxarch.cpp222
-rw-r--r--src/jit/instrsxarch.h28
-rw-r--r--src/jit/liveness.cpp3
-rw-r--r--src/jit/lower.cpp3
-rw-r--r--src/jit/lowerxarch.cpp25
-rw-r--r--src/jit/lsraxarch.cpp42
-rw-r--r--src/jit/morph.cpp3
-rw-r--r--src/jit/namedintrinsiclist.h12
-rw-r--r--src/jit/rangecheck.cpp6
-rw-r--r--src/jit/stacklevelsetter.cpp3
-rw-r--r--src/jit/valuenum.cpp3
25 files changed, 606 insertions, 213 deletions
diff --git a/src/jit/codegenlinear.h b/src/jit/codegenlinear.h
index 7727d6da69..8818b9e6c8 100644
--- a/src/jit/codegenlinear.h
+++ b/src/jit/codegenlinear.h
@@ -117,13 +117,10 @@ void genPutArgStkSIMD12(GenTree* treeNode);
#ifdef FEATURE_HW_INTRINSICS
void genHWIntrinsic(GenTreeHWIntrinsic* node);
#if defined(_TARGET_XARCH_)
-void genHWIntrinsic_FullRangeImm8(GenTreeHWIntrinsic* node, instruction ins);
void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins);
void genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins);
void genSSEIntrinsic(GenTreeHWIntrinsic* node);
void genSSE2Intrinsic(GenTreeHWIntrinsic* node);
-void genSSE3Intrinsic(GenTreeHWIntrinsic* node);
-void genSSSE3Intrinsic(GenTreeHWIntrinsic* node);
void genSSE41Intrinsic(GenTreeHWIntrinsic* node);
void genSSE42Intrinsic(GenTreeHWIntrinsic* node);
void genAVXIntrinsic(GenTreeHWIntrinsic* node);
@@ -135,6 +132,12 @@ void genFMAIntrinsic(GenTreeHWIntrinsic* node);
void genLZCNTIntrinsic(GenTreeHWIntrinsic* node);
void genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node);
void genPOPCNTIntrinsic(GenTreeHWIntrinsic* node);
+template <typename HWIntrinsicSwitchCaseBody>
+void genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic,
+ regNumber nonConstImmReg,
+ regNumber baseReg,
+ regNumber offsReg,
+ HWIntrinsicSwitchCaseBody emitSwCase);
#endif // defined(_TARGET_XARCH_)
#if defined(_TARGET_ARM64_)
instruction getOpForHWIntrinsic(GenTreeHWIntrinsic* node, var_types instrType);
diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp
index 7e0f688033..b5cb2c81ea 100644
--- a/src/jit/codegenxarch.cpp
+++ b/src/jit/codegenxarch.cpp
@@ -1967,6 +1967,9 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
genRangeCheck(treeNode);
break;
@@ -3756,12 +3759,7 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* tree)
// generate code for BoundsCheck nodes
void CodeGen::genRangeCheck(GenTree* oper)
{
-#ifdef FEATURE_SIMD
- noway_assert(oper->OperGet() == GT_ARR_BOUNDS_CHECK || oper->OperGet() == GT_SIMD_CHK);
-#else // !FEATURE_SIMD
- noway_assert(oper->OperGet() == GT_ARR_BOUNDS_CHECK);
-#endif // !FEATURE_SIMD
-
+ noway_assert(oper->OperIsBoundsCheck());
GenTreeBoundsChk* bndsChk = oper->AsBoundsChk();
GenTree* arrIndex = bndsChk->gtIndex;
diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp
index d24072835c..bbf8a9af53 100644
--- a/src/jit/compiler.cpp
+++ b/src/jit/compiler.cpp
@@ -7271,6 +7271,9 @@ void Compiler::CopyTestDataToCloneTree(GenTree* from, GenTree* to)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
CopyTestDataToCloneTree(from->gtBoundsChk.gtIndex, to->gtBoundsChk.gtIndex);
CopyTestDataToCloneTree(from->gtBoundsChk.gtArrLen, to->gtBoundsChk.gtArrLen);
return;
diff --git a/src/jit/compiler.h b/src/jit/compiler.h
index 72dafe3569..9f05442db0 100644
--- a/src/jit/compiler.h
+++ b/src/jit/compiler.h
@@ -3114,12 +3114,17 @@ protected:
bool compSupportsHWIntrinsic(InstructionSet isa);
bool isScalarISA(InstructionSet isa);
static int ivalOfHWIntrinsic(NamedIntrinsic intrinsic);
+ unsigned simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig);
static int numArgsOfHWIntrinsic(NamedIntrinsic intrinsic);
+ static GenTree* lastOpOfHWIntrinsic(GenTreeHWIntrinsic* node, int numArgs);
static instruction insOfHWIntrinsic(NamedIntrinsic intrinsic, var_types type);
static HWIntrinsicCategory categoryOfHWIntrinsic(NamedIntrinsic intrinsic);
static HWIntrinsicFlag flagsOfHWIntrinsic(NamedIntrinsic intrinsic);
GenTree* getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass);
- GenTreeArgList* buildArgList(CORINFO_SIG_INFO* sig);
+ static int immUpperBoundOfHWIntrinsic(NamedIntrinsic intrinsic);
+ GenTree* impNonConstFallback(NamedIntrinsic intrinsic, var_types simdType, var_types baseType);
+ static bool isImmHWIntrinsic(NamedIntrinsic intrinsic, GenTree* lastOp);
+ GenTree* addRangeCheckIfNeeded(NamedIntrinsic intrinsic, GenTree* lastOp, bool mustExpand);
#endif // _TARGET_XARCH_
#ifdef _TARGET_ARM64_
InstructionSet lookupHWIntrinsicISA(const char* className);
@@ -10039,6 +10044,9 @@ public:
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
{
GenTreeBoundsChk* const boundsChk = node->AsBoundsChk();
diff --git a/src/jit/compiler.hpp b/src/jit/compiler.hpp
index 888ef7eaf9..0a1204eed7 100644
--- a/src/jit/compiler.hpp
+++ b/src/jit/compiler.hpp
@@ -4929,6 +4929,9 @@ void GenTree::VisitOperands(TVisitor visitor)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
{
GenTreeBoundsChk* const boundsChk = this->AsBoundsChk();
if (visitor(boundsChk->gtIndex) == VisitResult::Abort)
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index 3a2f92b557..7e47ccfb7d 100644
--- a/src/jit/emitxarch.cpp
+++ b/src/jit/emitxarch.cpp
@@ -187,6 +187,14 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
case INS_psubusb:
case INS_psubusw:
case INS_psubw:
+ case INS_pslld:
+ case INS_psllq:
+ case INS_psllw:
+ case INS_psrld:
+ case INS_psrlq:
+ case INS_psrlw:
+ case INS_psrad:
+ case INS_psraw:
case INS_punpckhbw:
case INS_punpckhdq:
case INS_punpckhqdq:
@@ -209,6 +217,11 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
case INS_vinsertf128:
case INS_vinserti128:
case INS_vperm2i128:
+ case INS_vpsrlvd:
+ case INS_vpsrlvq:
+ case INS_vpsravd:
+ case INS_vpsllvd:
+ case INS_vpsllvq:
case INS_xorpd:
case INS_xorps:
return IsAVXInstruction(ins);
@@ -366,7 +379,7 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr)
// size specification (128 vs. 256 bits) and the operand size specification (32 vs. 64 bits), where both are
// required, the instruction must be created with the register size attribute (EA_16BYTE or EA_32BYTE),
// and here we must special case these by the opcode.
- if (ins == INS_vpermq)
+ if (ins == INS_vpermq || ins == INS_vpsrlvq || ins == INS_vpsllvq)
{
return true;
}
@@ -5479,6 +5492,50 @@ void emitter::emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg,
}
}
+static bool isSseShift(instruction ins)
+{
+ switch (ins)
+ {
+ case INS_psrldq:
+ case INS_pslldq:
+ case INS_psrld:
+ case INS_psrlw:
+ case INS_psrlq:
+ case INS_pslld:
+ case INS_psllw:
+ case INS_psllq:
+ case INS_psrad:
+ case INS_psraw:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int ival)
+{
+ // TODO-XARCH refactoring emitIns_R_R_I to handle SSE2/AVX2 shift as well as emitIns_R_I
+ bool isShift = isSseShift(ins);
+ if (UseVEXEncoding() && !isShift)
+ {
+ emitIns_R_R_I(ins, attr, reg, reg1, ival);
+ }
+ else
+ {
+ if (reg1 != reg)
+ {
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
+ }
+ // TODO-XARCH-BUG emitOutputRI does not work with SSE2 shift instructions when imm8 > 127, so we replace the
+ // immediate with a semantically equivalent value. https://github.com/dotnet/coreclr/issues/16543
+ if (isShift && ival > 127)
+ {
+ ival = 127;
+ }
+ emitIns_R_I(ins, attr, reg, ival);
+ }
+}
+
void emitter::emitIns_SIMD_R_R_R_R(
instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3)
{
@@ -10746,6 +10803,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id)
regOpcode = (regNumber)6;
break;
case INS_psrad:
+ case INS_psraw:
regOpcode = (regNumber)4;
break;
default:
diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h
index 6acd835da4..a5bc303719 100644
--- a/src/jit/emitxarch.h
+++ b/src/jit/emitxarch.h
@@ -496,6 +496,7 @@ void emitIns_SIMD_R_R_C(
instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs);
void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs);
void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2);
+void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int ival);
void emitIns_SIMD_R_R_R_R(
instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3);
#endif // FEATURE_HW_INTRINSICS
diff --git a/src/jit/flowgraph.cpp b/src/jit/flowgraph.cpp
index f3a0bf2ced..ccf70dcddf 100644
--- a/src/jit/flowgraph.cpp
+++ b/src/jit/flowgraph.cpp
@@ -9583,6 +9583,9 @@ void Compiler::fgSimpleLowering()
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
{
// Add in a call to an error routine.
fgSetRngChkTarget(tree, false);
@@ -18796,6 +18799,9 @@ void Compiler::fgSetTreeSeqHelper(GenTree* tree, bool isLIR)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
// Evaluate the trees left to right
fgSetTreeSeqHelper(tree->gtBoundsChk.gtIndex, isLIR);
fgSetTreeSeqHelper(tree->gtBoundsChk.gtArrLen, isLIR);
@@ -21426,6 +21432,9 @@ void Compiler::fgDebugCheckFlags(GenTree* tree)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
GenTreeBoundsChk* bndsChk;
bndsChk = tree->AsBoundsChk();
diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp
index 284b420120..9b8f88bd85 100644
--- a/src/jit/gentree.cpp
+++ b/src/jit/gentree.cpp
@@ -306,6 +306,10 @@ void GenTree::InitNodeSize()
#ifdef FEATURE_SIMD
GenTree::s_gtNodeSizes[GT_SIMD_CHK] = TREE_NODE_SZ_LARGE;
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ GenTree::s_gtNodeSizes[GT_HW_INTRINSIC_CHK] = TREE_NODE_SZ_LARGE;
+#endif // FEATURE_HW_INTRINSICS
+
GenTree::s_gtNodeSizes[GT_ARR_ELEM] = TREE_NODE_SZ_LARGE;
GenTree::s_gtNodeSizes[GT_ARR_INDEX] = TREE_NODE_SZ_LARGE;
GenTree::s_gtNodeSizes[GT_ARR_OFFSET] = TREE_NODE_SZ_LARGE;
@@ -1592,6 +1596,9 @@ AGAIN:
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
return Compare(op1->gtBoundsChk.gtIndex, op2->gtBoundsChk.gtIndex) &&
Compare(op1->gtBoundsChk.gtArrLen, op2->gtBoundsChk.gtArrLen) &&
(op1->gtBoundsChk.gtThrowKind == op2->gtBoundsChk.gtThrowKind);
@@ -1818,6 +1825,9 @@ AGAIN:
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
if (gtHasRef(tree->gtBoundsChk.gtIndex, lclNum, defOnly))
{
return true;
@@ -2236,6 +2246,9 @@ AGAIN:
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
hash = genTreeHashAdd(hash, gtHashValue(tree->gtBoundsChk.gtIndex));
hash = genTreeHashAdd(hash, gtHashValue(tree->gtBoundsChk.gtArrLen));
hash = genTreeHashAdd(hash, tree->gtBoundsChk.gtThrowKind);
@@ -2501,6 +2514,9 @@ AGAIN:
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
{
if (!lvaLclVarRefsAccum(tree->gtBoundsChk.gtIndex, findPtr, refsPtr, &allVars, &trkdVars))
{
@@ -4911,7 +4927,11 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
case GT_ARR_BOUNDS_CHECK:
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
-#endif // FEATURE_SIMD
+#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
+
costEx = 4; // cmp reg,reg and jae throw (not taken)
costSz = 7; // jump to cold section
@@ -5444,6 +5464,9 @@ GenTree** GenTree::gtGetChildPointer(GenTree* parent) const
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
if (this == parent->gtBoundsChk.gtIndex)
{
return &(parent->gtBoundsChk.gtIndex);
@@ -5709,6 +5732,9 @@ bool GenTree::TryGetUse(GenTree* def, GenTree*** use)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
{
GenTreeBoundsChk* const boundsChk = this->AsBoundsChk();
if (def == boundsChk->gtIndex)
@@ -6048,6 +6074,9 @@ bool GenTree::OperMayThrow(Compiler* comp)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
case GT_INDEX_ADDR:
return true;
default:
@@ -8192,6 +8221,9 @@ GenTree* Compiler::gtCloneExpr(
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
copy = new (this, oper)
GenTreeBoundsChk(oper, tree->TypeGet(),
gtCloneExpr(tree->gtBoundsChk.gtIndex, addFlags, deepVarNum, deepVarVal),
@@ -8877,6 +8909,9 @@ unsigned GenTree::NumChildren()
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
return 2;
case GT_FIELD:
@@ -9007,6 +9042,9 @@ GenTree* GenTree::GetChild(unsigned childNum)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
switch (childNum)
{
case 0:
@@ -9306,6 +9344,9 @@ GenTreeUseEdgeIterator::GenTreeUseEdgeIterator(GenTree* node)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
m_edge = &m_node->AsBoundsChk()->gtIndex;
assert(*m_edge != nullptr);
m_advance = &GenTreeUseEdgeIterator::AdvanceBoundsChk;
@@ -11855,6 +11896,9 @@ void Compiler::gtDispTree(GenTree* tree,
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
gtDispVN(tree);
printf("\n");
if (!topOnly)
@@ -15804,6 +15848,9 @@ void Compiler::gtExtractSideEffList(GenTree* expr,
#ifdef FEATURE_SIMD
|| expr->OperGet() == GT_SIMD_CHK
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ || expr->OperGet() == GT_HW_INTRINSIC_CHK
+#endif // FEATURE_HW_INTRINSICS
)
{
gtExtractSideEffList(expr->AsBoundsChk()->gtIndex, pList, flags);
diff --git a/src/jit/gentree.h b/src/jit/gentree.h
index e246c46bc3..25d5a286fd 100644
--- a/src/jit/gentree.h
+++ b/src/jit/gentree.h
@@ -1661,6 +1661,12 @@ public:
return true;
}
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ if (op == GT_HW_INTRINSIC_CHK)
+ {
+ return true;
+ }
+#endif // FEATURE_HW_INTRINSICS
return false;
}
diff --git a/src/jit/gtlist.h b/src/jit/gtlist.h
index fde7aa370e..13b8fb6e71 100644
--- a/src/jit/gtlist.h
+++ b/src/jit/gtlist.h
@@ -97,6 +97,10 @@ GTNODE(SIMD_CHK , GenTreeBoundsChk ,0,GTK_SPECIAL|GTK_NOVALUE)// Compa
// does the compare, so that it can be more easily optimized. But that involves generating qmarks at import time...
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+GTNODE(HW_INTRINSIC_CHK , GenTreeBoundsChk ,0,GTK_SPECIAL|GTK_NOVALUE)// Compare whether an imm8 argument is in the valid range, and throw ArgumentOutOfRangeException if not.
+#endif
+
GTNODE(ALLOCOBJ , GenTreeAllocObj ,0,GTK_UNOP|GTK_EXOP) // object allocator
GTNODE(INIT_VAL , GenTreeOp ,0,GTK_UNOP) // Initialization value for an initBlk
diff --git a/src/jit/gtstructs.h b/src/jit/gtstructs.h
index 6cfb92c98e..0df4e35078 100644
--- a/src/jit/gtstructs.h
+++ b/src/jit/gtstructs.h
@@ -85,11 +85,15 @@ GTSTRUCT_1(FptrVal , GT_FTN_ADDR)
GTSTRUCT_1(Intrinsic , GT_INTRINSIC)
GTSTRUCT_1(Index , GT_INDEX)
GTSTRUCT_1(IndexAddr , GT_INDEX_ADDR)
-#ifdef FEATURE_SIMD
+#if defined(FEATURE_HW_INTRINSICS) && defined(FEATURE_SIMD)
+GTSTRUCT_3(BoundsChk , GT_ARR_BOUNDS_CHECK, GT_SIMD_CHK, GT_HW_INTRINSIC_CHK)
+#elif defined(FEATURE_SIMD)
GTSTRUCT_2(BoundsChk , GT_ARR_BOUNDS_CHECK, GT_SIMD_CHK)
-#else // !FEATURE_SIMD
+#elif defined(FEATURE_HW_INTRINSICS)
+GTSTRUCT_2(BoundsChk , GT_ARR_BOUNDS_CHECK, GT_HW_INTRINSIC_CHK)
+#else // !FEATURE_SIMD && !FEATURE_HW_INTRINSICS
GTSTRUCT_1(BoundsChk , GT_ARR_BOUNDS_CHECK)
-#endif // !FEATURE_SIMD
+#endif // !FEATURE_SIMD && !FEATURE_HW_INTRINSICS
GTSTRUCT_1(ArrLen , GT_ARR_LENGTH)
GTSTRUCT_1(ArrElem , GT_ARR_ELEM)
GTSTRUCT_1(ArrOffs , GT_ARR_OFFSET)
diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp
index a4fc9f1875..dbc2e35fcb 100644
--- a/src/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/jit/hwintrinsiccodegenxarch.cpp
@@ -37,8 +37,8 @@ static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsi
{
// TODO - make more categories to the table-driven framework
// HW_Category_Helper and HW_Flag_MultiIns usually need manual codegen
- const bool tableDrivenCategory = category == HW_Category_SimpleSIMD || category == HW_Category_MemoryLoad ||
- category == HW_Category_MemoryStore || category == HW_Category_SIMDScalar;
+ const bool tableDrivenCategory =
+ category != HW_Category_Special && category != HW_Category_Scalar && category != HW_Category_Helper;
const bool tableDrivenFlag = (flags & HW_Flag_MultiIns) == 0;
return tableDrivenCategory && tableDrivenFlag;
}
@@ -75,7 +75,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
assert(numArgs >= 0);
instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
assert(ins != INS_invalid);
- emitAttr simdSize = (emitAttr)(node->gtSIMDSize);
+ emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
assert(simdSize != 0);
switch (numArgs)
@@ -99,9 +99,11 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
case 2:
genConsumeOperands(node);
+ op1Reg = op1->gtRegNum;
+ op2Reg = op2->gtRegNum;
if (category == HW_Category_MemoryStore)
{
- emit->emitIns_AR_R(ins, simdSize, op2->gtRegNum, op1->gtRegNum, 0);
+ emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0);
}
else if ((ival != -1) && varTypeIsFloating(baseType))
{
@@ -109,7 +111,28 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}
else if (category == HW_Category_MemoryLoad)
{
- emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1->gtRegNum, op2->gtRegNum);
+ emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
+ }
+ else if (Compiler::isImmHWIntrinsic(intrinsicID, op2))
+ {
+ auto emitSwCase = [&](unsigned i) {
+ emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, (int)i);
+ };
+
+ if (op2->IsCnsIntOrI())
+ {
+ ssize_t ival = op2->AsIntCon()->IconValue();
+ emitSwCase((unsigned)ival);
+ }
+ else
+ {
+ // We emit a fallback case for the scenario when the imm-op is not a constant. This should
+ // normally only happen when the intrinsic is called indirectly, such as via Reflection. However,
+ // it can also occur if the consumer calls it directly and just doesn't pass a constant value.
+ regNumber baseReg = node->ExtractTempReg();
+ regNumber offsReg = node->GetSingleTempReg();
+ genHWIntrinsicJumpTableFallback(intrinsicID, op2Reg, baseReg, offsReg, emitSwCase);
+ }
}
else
{
@@ -137,7 +160,30 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
genConsumeRegs(op3);
regNumber op3Reg = op3->gtRegNum;
- emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg);
+ if (Compiler::isImmHWIntrinsic(intrinsicID, op3))
+ {
+ auto emitSwCase = [&](unsigned i) {
+ emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, (int)i);
+ };
+ if (op3->IsCnsIntOrI())
+ {
+ ssize_t ival = op3->AsIntCon()->IconValue();
+ emitSwCase((unsigned)ival);
+ }
+ else
+ {
+ // We emit a fallback case for the scenario when the imm-op is not a constant. This should
+ // normally only happen when the intrinsic is called indirectly, such as via Reflection. However,
+ // it can also occur if the consumer calls it directly and just doesn't pass a constant value.
+ regNumber baseReg = node->ExtractTempReg();
+ regNumber offsReg = node->GetSingleTempReg();
+ genHWIntrinsicJumpTableFallback(intrinsicID, op3Reg, baseReg, offsReg, emitSwCase);
+ }
+ }
+ else
+ {
+ emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg);
+ }
break;
}
@@ -157,12 +203,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
case InstructionSet_SSE2:
genSSE2Intrinsic(node);
break;
- case InstructionSet_SSE3:
- genSSE3Intrinsic(node);
- break;
- case InstructionSet_SSSE3:
- genSSSE3Intrinsic(node);
- break;
case InstructionSet_SSE41:
genSSE41Intrinsic(node);
break;
@@ -202,100 +242,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}
}
-void CodeGen::genHWIntrinsic_FullRangeImm8(GenTreeHWIntrinsic* node, instruction ins)
-{
- var_types targetType = node->TypeGet();
- regNumber targetReg = node->gtRegNum;
- GenTree* op1 = node->gtGetOp1();
- regNumber op1Reg = REG_NA;
- GenTree* op2 = node->gtGetOp2();
- regNumber op2Reg = REG_NA;
- GenTree* op3 = nullptr;
- emitAttr simdSize = (emitAttr)(node->gtSIMDSize);
- emitter* emit = getEmitter();
-
- GenTreeArgList* argList;
-
- assert(op1->OperIsList());
- assert(op1->AsArgList()->Rest() != nullptr);
- assert(op1->AsArgList()->Rest()->Rest() != nullptr);
- assert(op1->AsArgList()->Rest()->Rest()->Rest() == nullptr);
- assert(op2 == nullptr);
-
- argList = op1->AsArgList();
- op1 = argList->Current();
- op1Reg = op1->gtRegNum;
- genConsumeRegs(op1);
-
- argList = argList->Rest();
- op2 = argList->Current();
- op2Reg = op2->gtRegNum;
- genConsumeRegs(op2);
-
- argList = argList->Rest();
- op3 = argList->Current();
- genConsumeRegs(op3);
-
- if (op3->IsCnsIntOrI())
- {
- ssize_t ival = op3->AsIntConCommon()->IconValue();
- emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, (int)ival);
- }
- else
- {
- // We emit a fallback case for the scenario when op3 is not a constant. This should normally
- // only happen when the intrinsic is called indirectly, such as via Reflection. However, it can
- // also occur if the consumer calls it directly and just doesn't pass a constant value.
-
- const unsigned jmpCount = 256;
- BasicBlock* jmpTable[jmpCount];
-
- unsigned jmpTableBase = emit->emitBBTableDataGenBeg(jmpCount, true);
- unsigned jmpTableOffs = 0;
-
- // Emit the jump table
-
- JITDUMP("\n J_M%03u_DS%02u LABEL DWORD\n", Compiler::s_compMethodsCount, jmpTableBase);
-
- for (unsigned i = 0; i < jmpCount; i++)
- {
- jmpTable[i] = genCreateTempLabel();
- JITDUMP(" DD L_M%03u_BB%02u\n", Compiler::s_compMethodsCount, jmpTable[i]->bbNum);
- emit->emitDataGenData(i, jmpTable[i]);
- }
-
- emit->emitDataGenEnd();
-
- // Compute and jump to the appropriate offset in the switch table
-
- regNumber baseReg = node->ExtractTempReg(); // the start of the switch table
- regNumber offsReg = node->GetSingleTempReg(); // the offset into the switch table
-
- emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
-
- emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, op3->gtRegNum, 4, 0);
- emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
- emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
- emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
-
- // Emit the switch table entries
-
- BasicBlock* switchTableBeg = genCreateTempLabel();
- BasicBlock* switchTableEnd = genCreateTempLabel();
-
- genDefineTempLabel(switchTableBeg);
-
- for (unsigned i = 0; i < jmpCount; i++)
- {
- genDefineTempLabel(jmpTable[i]);
- emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, i);
- emit->emitIns_J(INS_jmp, switchTableEnd);
- }
-
- genDefineTempLabel(switchTableEnd);
- }
-}
-
//------------------------------------------------------------------------
// genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
// register/memory operand, and that returns a value in register
@@ -310,7 +256,7 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
regNumber targetReg = node->gtRegNum;
GenTree* op1 = node->gtGetOp1();
GenTree* op2 = node->gtGetOp2();
- emitAttr simdSize = (emitAttr)(node->gtSIMDSize);
+ emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
emitter* emit = getEmitter();
// TODO-XArch-CQ: Commutative operations can have op1 be contained
@@ -433,7 +379,7 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
regNumber targetReg = node->gtRegNum;
GenTree* op1 = node->gtGetOp1();
GenTree* op2 = node->gtGetOp2();
- emitAttr simdSize = (emitAttr)(node->gtSIMDSize);
+ emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
int ival = Compiler::ivalOfHWIntrinsic(node->gtHWIntrinsicId);
emitter* emit = getEmitter();
@@ -544,6 +490,74 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
}
}
+// genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
+// with non-constant argument
+//
+// Arguments:
+// intrinsic - intrinsic ID
+// nonConstImmReg - the register that contains the non-constant imm8 argument
+// baseReg - a register for the start of the switch table
+// offsReg - a register for the offset into the switch table
+// emitSwCase - the lambda that generates the code for one switch-case
+//
+// Return Value:
+// None; emits the jump-table fallback for imm-intrinsics with a non-constant argument.
+// Note:
+// This function can be used for all imm-intrinsics (whether full-range or not).
+// The compiler front-end (i.e. the importer) is responsible for inserting a range-check IR node
+// (GT_HW_INTRINSIC_CHK) for the imm8 argument, so this function does not need to do a range check.
+//
+template <typename HWIntrinsicSwitchCaseBody>
+void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic,
+ regNumber nonConstImmReg,
+ regNumber baseReg,
+ regNumber offsReg,
+ HWIntrinsicSwitchCaseBody emitSwCase)
+{
+ assert(nonConstImmReg != REG_NA);
+ emitter* emit = getEmitter();
+
+ const unsigned maxByte = (unsigned)Compiler::immUpperBoundOfHWIntrinsic(intrinsic) + 1;
+ assert(maxByte <= 256);
+ BasicBlock* jmpTable[256];
+
+ unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
+ unsigned jmpTableOffs = 0;
+
+ // Emit the jump table
+ for (unsigned i = 0; i < maxByte; i++)
+ {
+ jmpTable[i] = genCreateTempLabel();
+ emit->emitDataGenData(i, jmpTable[i]);
+ }
+
+ emit->emitDataGenEnd();
+
+ // Compute and jump to the appropriate offset in the switch table
+ emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
+
+ emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
+ emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
+ emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
+ emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
+
+ // Emit the switch table entries
+
+ BasicBlock* switchTableBeg = genCreateTempLabel();
+ BasicBlock* switchTableEnd = genCreateTempLabel();
+
+ genDefineTempLabel(switchTableBeg);
+
+ for (unsigned i = 0; i < maxByte; i++)
+ {
+ genDefineTempLabel(jmpTable[i]);
+ emitSwCase(i);
+ emit->emitIns_J(INS_jmp, switchTableEnd);
+ }
+
+ genDefineTempLabel(switchTableEnd);
+}
+
//------------------------------------------------------------------------
// genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
//
@@ -752,10 +766,6 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
break;
}
- case NI_SSE_Shuffle:
- genHWIntrinsic_FullRangeImm8(node, INS_shufps);
- break;
-
case NI_SSE_StoreFence:
{
assert(baseType == TYP_VOID);
@@ -1009,25 +1019,6 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
genProduceReg(node);
}
-void CodeGen::genSSE3Intrinsic(GenTreeHWIntrinsic* node)
-{
- NYI("Implement SSE3 intrinsic code generation");
-}
-
-void CodeGen::genSSSE3Intrinsic(GenTreeHWIntrinsic* node)
-{
- if (node->gtHWIntrinsicId == NI_SSSE3_AlignRight)
- {
- genHWIntrinsic_FullRangeImm8(node, INS_palignr);
- }
- else
- {
- unreached();
- }
-
- genProduceReg(node);
-}
-
//------------------------------------------------------------------------
// genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
//
@@ -1097,12 +1088,34 @@ void CodeGen::genAVXIntrinsic(GenTreeHWIntrinsic* node)
{
NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
var_types baseType = node->gtSIMDBaseType;
- instruction ins = INS_invalid;
+ emitAttr attr = EA_ATTR(node->gtSIMDSize);
+ var_types targetType = node->TypeGet();
+ instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+ GenTree* op1 = node->gtGetOp1();
+ GenTree* op2 = node->gtGetOp2();
+ regNumber targetReg = node->gtRegNum;
+ emitter* emit = getEmitter();
genConsumeOperands(node);
switch (intrinsicID)
{
+ case NI_AVX_SetZeroVector256:
+ {
+ assert(op1 == nullptr);
+ assert(op2 == nullptr);
+ // SetZeroVector256 will generate pxor with an integral base type, but the 256-bit pxor is an AVX2
+ // instruction, so we generate xorps on AVX-only machines.
+ if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsIntegral(baseType))
+ {
+ emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
+ }
+ else
+ {
+ emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
+ }
+ break;
+ }
default:
unreached();
break;
diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h
index b5e30ea4ee..0c28863f17 100644
--- a/src/jit/hwintrinsiclistxarch.h
+++ b/src/jit/hwintrinsiclistxarch.h
@@ -10,7 +10,7 @@
// clang-format off
-#if FEATURE_HW_INTRINSICS
+#ifdef FEATURE_HW_INTRINSICS
/* Note
1) Each hardware intrinsic has a unique Intrinsic ID with type of `enum NamedIntrinsic`
2) All the overloads of an intrinsic in an ISA class share one Intrinsic ID
@@ -209,6 +209,11 @@ HARDWARE_INTRINSIC(SSE2_PackSignedSaturate, "PackSigned
HARDWARE_INTRINSIC(SSE2_PackUnsignedSaturate, "PackUnsignedSaturate", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_packuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromArg)
HARDWARE_INTRINSIC(SSE2_SetZeroVector128, "SetZeroVector128", SSE2, -1, 16, 0, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_Helper, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE2_SumAbsoluteDifferences, "SumAbsoluteDifferences", SSE2, -1, 16, 2, {INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromArg)
+HARDWARE_INTRINSIC(SSE2_ShiftLeftLogical, "ShiftLeftLogical", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_psllw, INS_psllw, INS_pslld, INS_pslld, INS_psllq, INS_psllq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE2_ShiftLeftLogical128BitLane, "ShiftLeftLogical128BitLane", SSE2, -1, 16, 2, {INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE2_ShiftRightArithmetic, "ShiftRightArithmetic", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_psraw, INS_invalid, INS_psrad, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE2_ShiftRightLogical, "ShiftRightLogical", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_psrlw, INS_psrlw, INS_psrld, INS_psrld, INS_psrlq, INS_psrlq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE2_ShiftRightLogical128BitLane, "ShiftRightLogical128BitLane", SSE2, -1, 16, 2, {INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(SSE2_Sqrt, "Sqrt", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE2_SqrtScalar, "SqrtScalar", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE2_Store, "Store", SSE2, -1, 16, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoFlag)
@@ -279,6 +284,8 @@ HARDWARE_INTRINSIC(AVX_And, "And",
HARDWARE_INTRINSIC(AVX_AndNot, "AndNot", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_AddSubtract, "AddSubtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_BlendVariable, "BlendVariable", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_Compare, "Compare", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_CompareScalar, "CompareScalar", AVX, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(AVX_ConvertToSingle, "ConvertToSingle", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_Divide, "Divide", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_divpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_DuplicateEvenIndexed, "DuplicateEvenIndexed", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
@@ -294,6 +301,7 @@ HARDWARE_INTRINSIC(AVX_Multiply, "Multiply",
HARDWARE_INTRINSIC(AVX_Or, "Or", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX_Reciprocal, "Reciprocal", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_ReciprocalSqrt, "ReciprocalSqrt", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_SetZeroVector256, "SetZeroVector256", AVX, -1, 32, 0, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_xorps, INS_xorpd}, HW_Category_Helper, HW_Flag_OneTypeGeneric)
HARDWARE_INTRINSIC(AVX_Sqrt, "Sqrt", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_Store, "Store", AVX, -1, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_StoreAligned, "StoreAligned", AVX, -1, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoFlag)
@@ -321,11 +329,19 @@ HARDWARE_INTRINSIC(AVX2_HorizontalSubtractSaturate, "Horizontal
HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal, "LoadAlignedVector256NonTemporal", AVX2, -1, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_Multiply, "Multiply", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX2_Or, "Or", AVX2, -1, 32, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX2_ShiftLeftLogical, "ShiftLeftLogical", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_psllw, INS_psllw, INS_pslld, INS_pslld, INS_psllq, INS_psllq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2_ShiftLeftLogical128BitLane, "ShiftLeftLogical128BitLane", AVX2, -1, 32, 2, {INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2_ShiftLeftLogicalVariable, "ShiftLeftLogicalVariable", AVX2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpsllvd, INS_vpsllvd, INS_vpsllvq, INS_vpsllvq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_UnfixedSIMDSize|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_ShiftRightArithmetic, "ShiftRightArithmetic", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_psraw, INS_invalid, INS_psrad, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2_ShiftRightLogical, "ShiftRightLogical", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_psrlw, INS_psrlw, INS_psrld, INS_psrld, INS_psrlq, INS_psrlq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2_ShiftRightLogical128BitLane, "ShiftRightLogical128BitLane", AVX2, -1, 32, 2, {INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2_ShiftRightLogicalVariable, "ShiftRightLogicalVariable", AVX2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpsrlvd, INS_vpsrlvd, INS_vpsrlvq, INS_vpsrlvq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_UnfixedSIMDSize|HW_Flag_NoContainment)
HARDWARE_INTRINSIC(AVX2_Subtract, "Subtract", AVX2, -1, 32, 2, {INS_psubb, INS_psubb, INS_psubw, INS_psubw, INS_psubd, INS_psubd, INS_psubq, INS_psubq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_SubtractSaturate, "SubtractSaturate", AVX2, -1, 32, 2, {INS_psubsb, INS_psubusb, INS_psubsw, INS_psubusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_UnpackHigh, "UnpackHigh", AVX2, -1, 32, 2, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq,INS_punpckhqdq,INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_UnpackLow, "UnpackLow", AVX2, -1, 32, 2, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq,INS_punpcklqdq,INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_Xor, "Xor", AVX2, -1, 32, 2, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+
// AES Intrinsics
HARDWARE_INTRINSIC(AES_IsSupported, "get_IsSupported", AES, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp
index 3b6ef766b4..4ab3bf8285 100644
--- a/src/jit/hwintrinsicxarch.cpp
+++ b/src/jit/hwintrinsicxarch.cpp
@@ -4,7 +4,7 @@
#include "jitpch.h"
-#if FEATURE_HW_INTRINSICS
+#ifdef FEATURE_HW_INTRINSICS
struct HWIntrinsicInfo
{
@@ -185,15 +185,21 @@ int Compiler::ivalOfHWIntrinsic(NamedIntrinsic intrinsic)
// Return Value:
// the SIMD size of this intrinsic
// - from the hwIntrinsicInfoArray table if intrinsic has NO HW_Flag_UnfixedSIMDSize
-// - TODO-XArch-NYI - from the signature if intrinsic has HW_Flag_UnfixedSIMDSize
+// - from the signature if intrinsic has HW_Flag_UnfixedSIMDSize
//
// Note - this function is only used by the importer
// after importation (i.e., codegen), we can get the SIMD size from GenTreeHWIntrinsic IR
-static unsigned simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig)
+unsigned Compiler::simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig)
{
assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
- assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flags & HW_Flag_UnfixedSIMDSize) == 0);
- return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize;
+ if ((Compiler::flagsOfHWIntrinsic(intrinsic) & HW_Flag_UnfixedSIMDSize) == 0)
+ {
+ return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize;
+ }
+
+ int simdSize = getSIMDTypeSizeInBytes(sig->retTypeSigClass);
+ assert(simdSize > 0);
+ return (unsigned)simdSize;
}
//------------------------------------------------------------------------
@@ -213,6 +219,41 @@ int Compiler::numArgsOfHWIntrinsic(NamedIntrinsic intrinsic)
}
//------------------------------------------------------------------------
+// lastOpOfHWIntrinsic: get the last operand of a HW intrinsic
+//
+// Arguments:
+//    node    -- the intrinsic node
+//    numArgs -- number of arguments of the intrinsic (0 to 3)
+//
+// Return Value:
+//    the last operand of the intrinsic node, or nullptr when the intrinsic takes no argument
+//
+// Notes:
+//    With one or two arguments the last operand is op1 or op2 of the node respectively.
+//    With three arguments op1 must be an argument list and the last operand is its third element.
+//
+GenTree* Compiler::lastOpOfHWIntrinsic(GenTreeHWIntrinsic* node, int numArgs)
+{
+    GenTree* op1 = node->gtGetOp1();
+    GenTree* op2 = node->gtGetOp2();
+    switch (numArgs)
+    {
+        case 0:
+            return nullptr;
+        case 1:
+            assert(op1 != nullptr);
+            return op1;
+        case 2:
+            assert(op2 != nullptr);
+            return op2;
+        case 3:
+            // Check op1 itself before dereferencing it, as the one-argument case does.
+            assert(op1 != nullptr);
+            assert(op1->OperIsList());
+            // The list must hold exactly three entries; the third one is the last operand.
+            assert(op1->AsArgList()->Rest()->Rest()->Current() != nullptr);
+            assert(op1->AsArgList()->Rest()->Rest()->Rest() == nullptr);
+            return op1->AsArgList()->Rest()->Rest()->Current();
+        default:
+            unreached();
+            return nullptr;
+    }
+}
+
+//------------------------------------------------------------------------
// insOfHWIntrinsic: get the instruction of the given intrinsic
//
// Arguments:
@@ -281,9 +322,9 @@ GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE
unsigned int argSizeBytes;
var_types base = getBaseTypeAndSizeOfSIMDType(argClass, &argSizeBytes);
argType = getSIMDTypeForSize(argSizeBytes);
- assert(argType == TYP_SIMD32 || argType == TYP_SIMD16);
+ assert((argType == TYP_SIMD32) || (argType == TYP_SIMD16));
arg = impSIMDPopStack(argType);
- assert(arg->TypeGet() == TYP_SIMD16 || arg->TypeGet() == TYP_SIMD32);
+ assert((arg->TypeGet() == TYP_SIMD16) || (arg->TypeGet() == TYP_SIMD32));
}
else
{
@@ -296,6 +337,136 @@ GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE
}
//------------------------------------------------------------------------
+// immUpperBoundOfHWIntrinsic: get the largest immediate value accepted by an IMM-category intrinsic
+//
+// Arguments:
+//    intrinsic -- intrinsic ID; must belong to HW_Category_IMM
+//
+// Return Value:
+//    31 for NI_AVX_Compare and NI_AVX_CompareScalar, 255 for every other IMM intrinsic
+//    (those are asserted to carry HW_Flag_FullRangeIMM)
+//
+int Compiler::immUpperBoundOfHWIntrinsic(NamedIntrinsic intrinsic)
+{
+    assert(categoryOfHWIntrinsic(intrinsic) == HW_Category_IMM);
+
+    if ((intrinsic == NI_AVX_Compare) || (intrinsic == NI_AVX_CompareScalar))
+    {
+        // enum FloatComparisonMode has 32 values
+        return 31;
+    }
+
+    // Every remaining IMM intrinsic is expected to accept the whole imm8 range.
+    assert((flagsOfHWIntrinsic(intrinsic) & HW_Flag_FullRangeIMM) != 0);
+    return 255;
+}
+
+//------------------------------------------------------------------------
+// impNonConstFallback: import the semantic alternative of certain SSE2/AVX2 shift intrinsics
+//                      when the imm-arg is not a compile-time constant
+//
+// Arguments:
+//    intrinsic -- intrinsic ID; must have HW_Flag_NoJmpTableIMM set
+//    simdType  -- Vector type
+//    baseType  -- base type of the Vector128/256<T>
+//
+// Return Value:
+//    the IR of the semantic alternative: the same intrinsic applied to the vector operand and
+//    to the count operand converted with NI_SSE2_ConvertScalarToVector128Int32
+//
+GenTree* Compiler::impNonConstFallback(NamedIntrinsic intrinsic, var_types simdType, var_types baseType)
+{
+    assert((flagsOfHWIntrinsic(intrinsic) & HW_Flag_NoJmpTableIMM) != 0);
+
+    // Only the SSE2/AVX2 shift intrinsics listed here have this fallback.
+    switch (intrinsic)
+    {
+        case NI_SSE2_ShiftLeftLogical:
+        case NI_SSE2_ShiftRightArithmetic:
+        case NI_SSE2_ShiftRightLogical:
+        case NI_AVX2_ShiftLeftLogical:
+        case NI_AVX2_ShiftRightArithmetic:
+        case NI_AVX2_ShiftRightLogical:
+            break;
+
+        default:
+            unreached();
+            return nullptr;
+    }
+
+    // The count is on top of the stack, the vector being shifted is right below it.
+    GenTree* shiftCount = impPopStack().val;
+    GenTree* vectorOp   = impSIMDPopStack(simdType);
+
+    // Move the scalar count into a Vector128 so that it can be passed as a vector operand.
+    GenTree* countVector =
+        gtNewSimdHWIntrinsicNode(TYP_SIMD16, shiftCount, NI_SSE2_ConvertScalarToVector128Int32, TYP_INT, 16);
+
+    return gtNewSimdHWIntrinsicNode(simdType, vectorOp, countVector, intrinsic, baseType, genTypeSize(simdType));
+}
+
+//------------------------------------------------------------------------
+// isImmHWIntrinsic: check whether the intrinsic is an imm-intrinsic overload
+//
+// Arguments:
+//    intrinsic -- intrinsic ID
+//    lastOp    -- the last operand of the intrinsic that may point to the imm-arg
+//
+// Return Value:
+//    true iff the intrinsic is an imm-intrinsic overload.
+//    Note: some intrinsics, with HW_Flag_MaybeIMM set, have both imm (integer immediate) and
+//    vector (i.e. non-TYP_INT) overloads; for those the type of lastOp decides.
+//
+bool Compiler::isImmHWIntrinsic(NamedIntrinsic intrinsic, GenTree* lastOp)
+{
+    const bool isImmCategory = (categoryOfHWIntrinsic(intrinsic) == HW_Category_IMM);
+    if (!isImmCategory)
+    {
+        return false;
+    }
+
+    const bool hasVectorOverload = ((flagsOfHWIntrinsic(intrinsic) & HW_Flag_MaybeIMM) != 0);
+
+    // lastOp is only inspected for intrinsics that also have a vector overload.
+    return !hasVectorOverload || (genActualType(lastOp->TypeGet()) == TYP_INT);
+}
+
+//------------------------------------------------------------------------
+// addRangeCheckIfNeeded: add a GT_HW_INTRINSIC_CHK node for non-full-range imm-intrinsic
+//
+// Arguments:
+//    intrinsic  -- intrinsic ID
+//    lastOp     -- the last operand of the intrinsic that points to the imm-arg
+//    mustExpand -- true if the compiler is compiling the fallback(GT_CALL) of this intrinsics
+//
+// Return Value:
+//    lastOp itself when no check is required; otherwise a GT_COMMA whose first operand is a
+//    GT_HW_INTRINSIC_CHK node, which would throw ArgumentOutOfRangeException when the imm-argument
+//    is not in the valid range [0, immUpperBoundOfHWIntrinsic(intrinsic)], and whose second
+//    operand is lastOp
+//
+GenTree* Compiler::addRangeCheckIfNeeded(NamedIntrinsic intrinsic, GenTree* lastOp, bool mustExpand)
+{
+    assert(lastOp != nullptr);
+
+    // Full-range imm-intrinsics do not need the range-check
+    // because the imm-parameter of the intrinsic method is a byte.
+    const bool needsCheck = mustExpand && ((flagsOfHWIntrinsic(intrinsic) & HW_Flag_FullRangeIMM) == 0) &&
+                            isImmHWIntrinsic(intrinsic, lastOp);
+    if (!needsCheck)
+    {
+        return lastOp;
+    }
+
+    assert(!lastOp->IsCnsIntOrI());
+
+    // immUpperBoundOfHWIntrinsic returns the largest *valid* immediate (inclusive), whereas the
+    // check node is a GenTreeBoundsChk whose second operand acts as an exclusive limit.
+    // NOTE(review): this assumes the check fails when index >= limit, as for array bounds checks.
+    // Use "upper bound + 1" so that the maximum valid immediate itself (e.g. 31 for AVX Compare)
+    // is accepted instead of throwing.
+    GenTree* upperBoundNode =
+        new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, immUpperBoundOfHWIntrinsic(intrinsic) + 1);
+
+    // The imm-arg is used twice (by the check and by the intrinsic): spill it to a temp when it
+    // has side effects, otherwise a plain clone is enough.
+    GenTree* index = nullptr;
+    if ((lastOp->gtFlags & GTF_SIDE_EFFECT) != 0)
+    {
+        index = fgInsertCommaFormTemp(&lastOp);
+    }
+    else
+    {
+        index = gtCloneExpr(lastOp);
+    }
+
+    GenTreeBoundsChk* hwIntrinsicChk = new (this, GT_HW_INTRINSIC_CHK)
+        GenTreeBoundsChk(GT_HW_INTRINSIC_CHK, TYP_VOID, index, upperBoundNode, SCK_RNGCHK_FAIL);
+    // Report the failure as an argument-range exception rather than the default range-check failure.
+    hwIntrinsicChk->gtThrowKind = SCK_ARG_RNG_EXCPN;
+
+    return gtNewOperNode(GT_COMMA, lastOp->TypeGet(), hwIntrinsicChk, lastOp);
+}
+
+//------------------------------------------------------------------------
// isFullyImplmentedISAClass: return true if all the hardware intrinsics
// of this ISA are implemented in RyuJIT.
//
@@ -445,16 +616,35 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
{
return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
}
- else if (category == HW_Category_IMM)
+ // Avoid checking stacktop for 0-op intrinsics
+ if (sig->numArgs > 0 && isImmHWIntrinsic(intrinsic, impStackTop().val))
{
GenTree* lastOp = impStackTop().val;
- if (!lastOp->IsCnsIntOrI() && !mustExpand)
+ // The imm-HWintrinsics that do not accept all imm8 values may throw
+ // ArgumentOutOfRangeException when the imm argument is not in the valid range
+ if ((flags & HW_Flag_FullRangeIMM) == 0)
{
- // When the imm-argument is not a constant and we are not being forced to expand, we need to
- // return nullptr so a GT_CALL to the intrinsic method is emitted instead. The
- // intrinsic method is recursive and will be forced to expand, at which point
- // we emit some less efficient fallback code.
- return nullptr;
+ if (!mustExpand && lastOp->IsCnsIntOrI() &&
+ lastOp->AsIntCon()->IconValue() > immUpperBoundOfHWIntrinsic(intrinsic))
+ {
+ return nullptr;
+ }
+ }
+
+ if (!lastOp->IsCnsIntOrI())
+ {
+ if ((flags & HW_Flag_NoJmpTableIMM) == 0 && !mustExpand)
+ {
+ // When the imm-argument is not a constant and we are not being forced to expand, we need to
+ // return nullptr so a GT_CALL to the intrinsic method is emitted instead. The
+ // intrinsic method is recursive and will be forced to expand, at which point
+ // we emit some less efficient fallback code.
+ return nullptr;
+ }
+ else if ((flags & HW_Flag_NoJmpTableIMM) != 0)
+ {
+ return impNonConstFallback(intrinsic, retType, baseType);
+ }
}
}
@@ -537,6 +727,8 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
strip(info.compCompHnd->getArgType(sig, info.compCompHnd->getArgNext(argList), &argClass)));
op2 = getArgForHWIntrinsic(argType, argClass);
+ op2 = addRangeCheckIfNeeded(intrinsic, op2, mustExpand);
+
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
op1 = getArgForHWIntrinsic(argType, argClass);
@@ -551,6 +743,8 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg3, &argClass)));
GenTree* op3 = getArgForHWIntrinsic(argType, argClass);
+ op3 = addRangeCheckIfNeeded(intrinsic, op3, mustExpand);
+
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass)));
op2 = getArgForHWIntrinsic(argType, argClass);
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index b23b44539f..3d314a699f 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -347,14 +347,14 @@ INST3( psubusw, "psubusw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE,
// which is handled in emitxarch.cpp.
INST3( psrldq, "psrldq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Shift right logical of xmm reg by given number of bytes
INST3( pslldq, "pslldq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Shift left logical of xmm reg by given number of bytes
-INST3( psllq, "psllq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Packed shift left logical of 64-bit integers
-INST3( psrlq, "psrlq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Packed shift right logical of 64-bit integers
-INST3( pslld, "pslld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift left logical of 32-bit integers
-INST3( psrld, "psrld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift right logical of 32-bit integers
-INST3( psllw, "psllw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift left logical of 16-bit integers
-INST3( psrlw, "psrlw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift right logical of 16-bit integers
-INST3( psrad, "psrad" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift right arithmetic of 32-bit integers
-INST3( psraw, "psraw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift right arithmetic of 16-bit integers
+INST3( psllw, "psllw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), PCKDBL(0xF1)) // Packed shift left logical of 16-bit integers
+INST3( pslld, "pslld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), PCKDBL(0xF2)) // Packed shift left logical of 32-bit integers
+INST3( psllq, "psllq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), PCKDBL(0xF3)) // Packed shift left logical of 64-bit integers
+INST3( psrlw, "psrlw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), PCKDBL(0xD1)) // Packed shift right logical of 16-bit integers
+INST3( psrld, "psrld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), PCKDBL(0xD2)) // Packed shift right logical of 32-bit integers
+INST3( psrlq, "psrlq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), PCKDBL(0xD3)) // Packed shift right logical of 64-bit integers
+INST3( psraw, "psraw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), PCKDBL(0xE1)) // Packed shift right arithmetic of 16-bit integers
+INST3( psrad, "psrad" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), PCKDBL(0xE2)) // Packed shift right arithmetic of 32-bit integers
INST3( pmaxub, "pmaxub" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDE)) // packed maximum unsigned bytes
INST3( pminub, "pminub" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDA)) // packed minimum unsigned bytes
@@ -455,9 +455,15 @@ INST3( vinserti128, "inserti128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS
INST3( vzeroupper, "zeroupper" , 0, IUM_WR, 0, 0, 0xC577F8, BAD_CODE, BAD_CODE) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
INST3( vperm2i128, "perm2i128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x46)) // Permute 128-bit halves of input register
INST3( vpermq, "permq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x00)) // Permute 64-bit of input register
-INST3( vblendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4A)) // Variable Blend Packed Singles
-INST3( vblendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4B)) // Variable Blend Packed Doubles
-INST3( vpblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4C)) // Variable Blend Packed Bytes
+INST3( vblendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4A)) // Variable Blend Packed Singles
+INST3( vblendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4B)) // Variable Blend Packed Doubles
+INST3( vpblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4C)) // Variable Blend Packed Bytes
+
+INST3( vpsrlvd, "psrlvd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x45)) // Variable Bit Shift Right Logical
+INST3( vpsrlvq, "psrlvq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x45)) // Variable Bit Shift Right Logical
+INST3( vpsravd, "psravd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x46)) // Variable Bit Shift Right Arithmetic
+INST3( vpsllvd, "psllvd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x47)) // Variable Bit Shift Left Logical
+INST3( vpsllvq, "psllvq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x47)) // Variable Bit Shift Left Logical
INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
// Scalar instructions in SSE4.2
diff --git a/src/jit/liveness.cpp b/src/jit/liveness.cpp
index 6793bf545f..dac5a00e1b 100644
--- a/src/jit/liveness.cpp
+++ b/src/jit/liveness.cpp
@@ -2181,6 +2181,9 @@ void Compiler::fgComputeLifeLIR(VARSET_TP& life, BasicBlock* block, VARSET_VALAR
#if defined(FEATURE_SIMD)
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
case GT_JCMP:
case GT_CMP:
case GT_JCC:
diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp
index 28258b1faf..4459c45d99 100644
--- a/src/jit/lower.cpp
+++ b/src/jit/lower.cpp
@@ -204,6 +204,9 @@ GenTree* Lowering::LowerNode(GenTree* node)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
ContainCheckBoundsChk(node->AsBoundsChk());
break;
#endif // _TARGET_XARCH_
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 8eb9e164cf..3f1deb825f 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -2412,27 +2412,18 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}
}
- else if (numArgs == 3)
+
+ if (Compiler::categoryOfHWIntrinsic(intrinsicID) == HW_Category_IMM)
{
- switch (category)
+ assert(numArgs >= 2);
+ GenTree* lastOp = Compiler::lastOpOfHWIntrinsic(node, numArgs);
+ assert(lastOp != nullptr);
+ if (Compiler::isImmHWIntrinsic(intrinsicID, lastOp))
{
- case HW_Category_IMM:
+ if (lastOp->IsCnsIntOrI())
{
- assert(op1->OperIsList());
- GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current();
-
- if (op3->IsCnsIntOrI())
- {
- MakeSrcContained(node, op3);
- }
- break;
+ MakeSrcContained(node, lastOp);
}
-
- default:
- // TODO-XArch-CQ: Assert that this is unreached after we have ensured the relevant node types are
- // handled.
- // https://github.com/dotnet/coreclr/issues/16497
- break;
}
}
}
diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp
index 365a658257..6ff82a78ee 100644
--- a/src/jit/lsraxarch.cpp
+++ b/src/jit/lsraxarch.cpp
@@ -449,6 +449,9 @@ void LinearScan::BuildNode(GenTree* tree)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
// Consumes arrLen & index - has no result
info->srcCount = 2;
assert(info->dstCount == 0);
@@ -2253,6 +2256,7 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
TreeNodeInfo* info = currentNodeInfo;
NamedIntrinsic intrinsicID = intrinsicTree->gtHWIntrinsicId;
InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID);
+ int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID);
if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2)
{
SetContainsAVXFlags(true, 32);
@@ -2281,6 +2285,24 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
info->srcCount += GetOperandInfo(op2);
}
+ if (Compiler::categoryOfHWIntrinsic(intrinsicID) == HW_Category_IMM &&
+ (Compiler::flagsOfHWIntrinsic(intrinsicID) & HW_Flag_NoJmpTableIMM) == 0)
+ {
+ GenTree* lastOp = Compiler::lastOpOfHWIntrinsic(intrinsicTree, numArgs);
+ assert(lastOp != nullptr);
+ if (Compiler::isImmHWIntrinsic(intrinsicID, lastOp) && !lastOp->isContainedIntOrIImmed())
+ {
+ assert(!lastOp->IsCnsIntOrI());
+
+ // We need two extra registers when lastOp isn't a constant so
+ // that the offset into the jump table for the fallback path
+ // can be computed.
+
+ info->internalIntCount = 2;
+ info->setInternalCandidates(this, allRegs(TYP_INT));
+ }
+ }
+
switch (intrinsicID)
{
case NI_SSE_CompareEqualOrderedScalar:
@@ -2301,26 +2323,6 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
info->setInternalCandidates(this, allSIMDRegs());
break;
- case NI_SSE_Shuffle:
- case NI_SSSE3_AlignRight:
- {
- assert(op1->OperIsList());
- GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current();
-
- if (!op3->isContainedIntOrIImmed())
- {
- assert(!op3->IsCnsIntOrI());
-
- // We need two extra reg when op3 isn't a constant so
- // the offset into the jump table for the fallback path
- // can be computed.
-
- info->internalIntCount = 2;
- info->setInternalCandidates(this, allRegs(TYP_INT));
- }
- break;
- }
-
case NI_SSE_ConvertToSingle:
case NI_SSE_StaticCast:
case NI_SSE2_ConvertToDouble:
diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp
index 265e5d2237..6674a8cd1b 100644
--- a/src/jit/morph.cpp
+++ b/src/jit/morph.cpp
@@ -15672,6 +15672,9 @@ GenTree* Compiler::fgMorphTree(GenTree* tree, MorphAddrContext* mac)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
{
fgSetRngChkTarget(tree);
diff --git a/src/jit/namedintrinsiclist.h b/src/jit/namedintrinsiclist.h
index 3bc85976d1..cf739328c2 100644
--- a/src/jit/namedintrinsiclist.h
+++ b/src/jit/namedintrinsiclist.h
@@ -31,7 +31,7 @@ enum NamedIntrinsic : unsigned int
#endif
};
-#if FEATURE_HW_INTRINSICS && defined(_TARGET_XARCH_)
+#if defined(FEATURE_HW_INTRINSICS) && defined(_TARGET_XARCH_)
enum HWIntrinsicFlag : unsigned int
{
HW_Flag_NoFlag = 0,
@@ -79,7 +79,15 @@ enum HWIntrinsicFlag : unsigned int
HW_Flag_BaseTypeFromArg = 0x400,
// Indicates compFloatingPointUsed does not need to be set.
- HW_Flag_NoFloatingPointUsed = 0x800
+ HW_Flag_NoFloatingPointUsed = 0x800,
+
+ // Maybe IMM
+ // the intrinsic has either imm or Vector overloads
+ HW_Flag_MaybeIMM = 0x1000,
+
+ // NoJmpTable IMM
+ // the imm intrinsic does not need jumptable fallback when it gets non-const argument
+ HW_Flag_NoJmpTableIMM = 0x2000,
};
inline HWIntrinsicFlag operator|(HWIntrinsicFlag c1, HWIntrinsicFlag c2)
diff --git a/src/jit/rangecheck.cpp b/src/jit/rangecheck.cpp
index 15c01b3a46..57870e6b56 100644
--- a/src/jit/rangecheck.cpp
+++ b/src/jit/rangecheck.cpp
@@ -224,7 +224,11 @@ void RangeCheck::OptimizeRangeCheck(BasicBlock* block, GenTree* stmt, GenTree* t
}
else
#ifdef FEATURE_SIMD
- if (tree->gtOper != GT_SIMD_CHK)
+ if (tree->gtOper != GT_SIMD_CHK
+#ifdef FEATURE_HW_INTRINSICS
+ && tree->gtOper != GT_HW_INTRINSIC_CHK
+#endif // FEATURE_HW_INTRINSICS
+ )
#endif // FEATURE_SIMD
{
arrSize = GetArrLength(arrLenVn);
diff --git a/src/jit/stacklevelsetter.cpp b/src/jit/stacklevelsetter.cpp
index a3d9259257..0694bfdab1 100644
--- a/src/jit/stacklevelsetter.cpp
+++ b/src/jit/stacklevelsetter.cpp
@@ -133,6 +133,9 @@ void StackLevelSetter::SetThrowHelperBlocks(GenTree* node, BasicBlock* block)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
{
GenTreeBoundsChk* bndsChk = node->AsBoundsChk();
SetThrowHelperBlock(bndsChk->gtThrowKind, block);
diff --git a/src/jit/valuenum.cpp b/src/jit/valuenum.cpp
index 4948be0ddb..3723ff4a06 100644
--- a/src/jit/valuenum.cpp
+++ b/src/jit/valuenum.cpp
@@ -7119,6 +7119,9 @@ void Compiler::fgValueNumberTree(GenTree* tree, bool evalAsgLhsInd)
#ifdef FEATURE_SIMD
case GT_SIMD_CHK:
#endif // FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HW_INTRINSIC_CHK:
+#endif // FEATURE_HW_INTRINSICS
{
// A bounds check node has no value, but may throw exceptions.
ValueNumPair excSet = vnStore->VNPExcSetSingleton(