summaryrefslogtreecommitdiff
path: root/src/jit
diff options
context:
space:
mode:
authorFei Peng <fei.peng@intel.com>2018-01-19 00:01:21 -0800
committerFei Peng <fei.peng@intel.com>2018-01-19 00:01:21 -0800
commit654a8d5ff0cf1a7e0968c68a4b84aecf03ed9c1c (patch)
tree2cebb82cd42b3cec6de0ca3e0d50b6954db6263b /src/jit
parentfffd34540b50cee2ae5a3e1215e525f029eec124 (diff)
downloadcoreclr-654a8d5ff0cf1a7e0968c68a4b84aecf03ed9c1c.tar.gz
coreclr-654a8d5ff0cf1a7e0968c68a4b84aecf03ed9c1c.tar.bz2
coreclr-654a8d5ff0cf1a7e0968c68a4b84aecf03ed9c1c.zip
Merge SSE intrinsics into the table-driven framework
Diffstat (limited to 'src/jit')
-rw-r--r--src/jit/compiler.h2
-rw-r--r--src/jit/emitxarch.cpp44
-rw-r--r--src/jit/emitxarch.h19
-rw-r--r--src/jit/hwintrinsiccodegenxarch.cpp216
-rw-r--r--src/jit/hwintrinsiclistxarch.h132
-rw-r--r--src/jit/hwintrinsicxarch.cpp292
-rw-r--r--src/jit/instrsxarch.h1
-rw-r--r--src/jit/lowerxarch.cpp12
-rw-r--r--src/jit/lsraxarch.cpp3
-rw-r--r--src/jit/namedintrinsiclist.h26
10 files changed, 267 insertions, 480 deletions
diff --git a/src/jit/compiler.h b/src/jit/compiler.h
index ea1c6b2ac4..f48a7b7965 100644
--- a/src/jit/compiler.h
+++ b/src/jit/compiler.h
@@ -3122,7 +3122,7 @@ protected:
static int numArgsOfHWIntrinsic(NamedIntrinsic intrinsic);
static instruction insOfHWIntrinsic(NamedIntrinsic intrinsic, var_types type);
static HWIntrinsicCategory categoryOfHWIntrinsic(NamedIntrinsic intrinsic);
- static HWIntrinsicFlag flagOfHWIntrinsic(NamedIntrinsic intrinsic);
+ static HWIntrinsicFlag flagsOfHWIntrinsic(NamedIntrinsic intrinsic);
GenTree* getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass);
GenTreeArgList* buildArgList(CORINFO_SIG_INFO* sig);
#endif // _TARGET_XARCH_
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index 0f6a9ff059..73f28d94c4 100644
--- a/src/jit/emitxarch.cpp
+++ b/src/jit/emitxarch.cpp
@@ -4228,6 +4228,7 @@ void emitter::emitIns_R_R_S_I(
emitCurIGsize += sz;
}
+#ifdef DEBUG
static bool isAvxBlendv(instruction ins)
{
return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb;
@@ -4237,6 +4238,7 @@ static bool isSse41Blendv(instruction ins)
{
return ins == INS_blendvps || ins == INS_blendvpd || ins == INS_pblendvb;
}
+#endif
void emitter::emitIns_R_R_R_R(
instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, regNumber reg3)
@@ -5216,23 +5218,23 @@ void emitter::emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg,
{
emitIns_R_R(INS_movaps, attr, reg, reg1);
}
- emitIns_R_A(ins, emitTypeSize(simdtype), reg, indir, IF_RRW_ARD);
+ emitIns_R_A(ins, attr, reg, indir, IF_RRW_ARD);
}
}
-void emitter::emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1, regNumber base, var_types simdtype)
+void emitter::emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber base)
{
if (UseVEXEncoding())
{
- emitIns_R_R_AR(ins, emitTypeSize(simdtype), reg, reg1, base, 0);
+ emitIns_R_R_AR(ins, attr, reg, reg1, base, 0);
}
else
{
if (reg1 != reg)
{
- emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
}
- emitIns_R_AR(ins, emitTypeSize(simdtype), reg, base, 0);
+ emitIns_R_AR(ins, attr, reg, base, 0);
}
}
@@ -5325,70 +5327,70 @@ void emitter::emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg,
}
void emitter::emitIns_SIMD_R_R_A_I(
- instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival, var_types simdtype)
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival)
{
if (UseVEXEncoding())
{
- emitIns_R_R_A_I(ins, emitTypeSize(simdtype), reg, reg1, indir, ival, IF_RWR_RRD_ARD_CNS);
+ emitIns_R_R_A_I(ins, attr, reg, reg1, indir, ival, IF_RWR_RRD_ARD_CNS);
}
else
{
if (reg1 != reg)
{
- emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
}
- emitIns_R_A_I(ins, emitTypeSize(simdtype), reg, indir, ival);
+ emitIns_R_A_I(ins, attr, reg, indir, ival);
}
}
void emitter::emitIns_SIMD_R_R_C_I(
- instruction ins, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival, var_types simdtype)
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival)
{
if (UseVEXEncoding())
{
- emitIns_R_R_C_I(ins, emitTypeSize(simdtype), reg, reg1, fldHnd, offs, ival);
+ emitIns_R_R_C_I(ins, attr, reg, reg1, fldHnd, offs, ival);
}
else
{
if (reg1 != reg)
{
- emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
}
- emitIns_R_C_I(ins, emitTypeSize(simdtype), reg, fldHnd, offs, ival);
+ emitIns_R_C_I(ins, attr, reg, fldHnd, offs, ival);
}
}
void emitter::emitIns_SIMD_R_R_R_I(
- instruction ins, regNumber reg, regNumber reg1, regNumber reg2, int ival, var_types simdtype)
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int ival)
{
if (UseVEXEncoding())
{
- emitIns_R_R_R_I(ins, emitTypeSize(simdtype), reg, reg1, reg2, ival);
+ emitIns_R_R_R_I(ins, attr, reg, reg1, reg2, ival);
}
else
{
if (reg1 != reg)
{
- emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
}
- emitIns_R_R_I(ins, emitTypeSize(simdtype), reg, reg2, ival);
+ emitIns_R_R_I(ins, attr, reg, reg2, ival);
}
}
void emitter::emitIns_SIMD_R_R_S_I(
- instruction ins, regNumber reg, regNumber reg1, int varx, int offs, int ival, var_types simdtype)
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs, int ival)
{
if (UseVEXEncoding())
{
- emitIns_R_R_S_I(ins, emitTypeSize(simdtype), reg, reg1, varx, offs, ival);
+ emitIns_R_R_S_I(ins, attr, reg, reg1, varx, offs, ival);
}
else
{
if (reg1 != reg)
{
- emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1);
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
}
- emitIns_R_S_I(ins, emitTypeSize(simdtype), reg, varx, offs, ival);
+ emitIns_R_S_I(ins, attr, reg, varx, offs, ival);
}
}
#endif
diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h
index 0473447871..f2e426d07a 100644
--- a/src/jit/emitxarch.h
+++ b/src/jit/emitxarch.h
@@ -455,19 +455,12 @@ void emitIns_R_AX(instruction ins, emitAttr attr, regNumber ireg, regNumber reg,
void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp);
#if FEATURE_HW_INTRINSICS
-void emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1, regNumber base, var_types simdtype);
-void emitIns_SIMD_R_R_A_I(
- instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival, var_types simdtype);
-void emitIns_SIMD_R_R_C_I(instruction ins,
- regNumber reg,
- regNumber reg1,
- CORINFO_FIELD_HANDLE fldHnd,
- int offs,
- int ival,
- var_types simdtype);
-void emitIns_SIMD_R_R_R_I(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, int ival, var_types simdtype);
-void emitIns_SIMD_R_R_S_I(
- instruction ins, regNumber reg, regNumber reg1, int varx, int offs, int ival, var_types simdtype);
+void emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber base);
+void emitIns_SIMD_R_R_A_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival);
+void emitIns_SIMD_R_R_C_I(
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival);
+void emitIns_SIMD_R_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int ival);
+void emitIns_SIMD_R_R_S_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs, int ival);
void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir);
void emitIns_SIMD_R_R_C(
instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs);
diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp
index 6ea0de7e23..69b3cf54ba 100644
--- a/src/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/jit/hwintrinsiccodegenxarch.cpp
@@ -33,12 +33,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
// Return Value:
// returns true if this category can be table-driven in CodeGen
//
-static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category)
+static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags)
{
// TODO - make more categories to the table-driven framework
- const bool tableDrivenIntrinsic = category == HW_Category_SimpleSIMD;
- const bool nonTableDrivenIntrinsic = category == HW_Category_Special;
- return tableDrivenIntrinsic && !nonTableDrivenIntrinsic;
+ // HW_Category_Helper and HW_Flag_MultiIns usually need manual codegen
+ const bool tableDrivenCategory =
+ category == HW_Category_SimpleSIMD || category == HW_Category_MemoryLoad || category == HW_Category_SIMDScalar;
+ const bool tableDrivenFlag = (flags & HW_Flag_MultiIns) == 0;
+ return tableDrivenCategory && tableDrivenFlag;
}
void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
@@ -46,12 +48,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID);
HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID);
- HWIntrinsicFlag flag = Compiler::flagOfHWIntrinsic(intrinsicID);
+ HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID);
+ int ival = Compiler::ivalOfHWIntrinsic(intrinsicID);
int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID);
- assert((flag & HW_Flag_NoCodeGen) == 0);
+ assert((flags & HW_Flag_NoCodeGen) == 0);
- if (genIsTableDrivenHWIntrinsic(category))
+ if (genIsTableDrivenHWIntrinsic(category, flags))
{
GenTree* op1 = node->gtGetOp1();
GenTree* op2 = node->gtGetOp2();
@@ -74,10 +77,35 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
case 1:
genConsumeOperands(node);
op1Reg = op1->gtRegNum;
- emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
+ if (category == HW_Category_MemoryLoad)
+ {
+ emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
+ }
+ else if (category == HW_Category_SIMDScalar && (flags & HW_Flag_CopyUpperBits) != 0)
+ {
+ emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
+ }
+ else
+ {
+
+ emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
+ }
break;
+
case 2:
- genHWIntrinsic_R_R_RM(node, ins);
+ genConsumeOperands(node);
+ if (ival != -1)
+ {
+ genHWIntrinsic_R_R_RM_I(node, ins);
+ }
+ else if (category == HW_Category_MemoryLoad)
+ {
+ emit->emitIns_SIMD_R_R_AR(ins, emitTypeSize(TYP_SIMD16), targetReg, op1->gtRegNum, op2->gtRegNum);
+ }
+ else
+ {
+ genHWIntrinsic_R_R_RM(node, ins);
+ }
break;
case 3:
{
@@ -329,14 +357,14 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
case GT_CLS_VAR_ADDR:
{
- emit->emitIns_SIMD_R_R_C_I(ins, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0, ival,
- targetType);
+ emit->emitIns_SIMD_R_R_C_I(ins, emitTypeSize(targetType), targetReg, op1Reg,
+ memBase->gtClsVar.gtClsVarHnd, 0, ival);
return;
}
default:
{
- emit->emitIns_SIMD_R_R_A_I(ins, targetReg, op1Reg, memIndir, ival, targetType);
+ emit->emitIns_SIMD_R_R_A_I(ins, emitTypeSize(targetType), targetReg, op1Reg, memIndir, ival);
return;
}
}
@@ -374,11 +402,11 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
assert(offset != (unsigned)-1);
- emit->emitIns_SIMD_R_R_S_I(ins, targetReg, op1Reg, varNum, offset, ival, targetType);
+ emit->emitIns_SIMD_R_R_S_I(ins, emitTypeSize(targetType), targetReg, op1Reg, varNum, offset, ival);
}
else
{
- emit->emitIns_SIMD_R_R_R_I(ins, targetReg, op1Reg, op2->gtRegNum, ival, targetType);
+ emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(targetType), targetReg, op1Reg, op2->gtRegNum, ival);
}
}
@@ -392,7 +420,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
regNumber targetReg = node->gtRegNum;
var_types targetType = node->TypeGet();
var_types baseType = node->gtSIMDBaseType;
- instruction ins = INS_invalid;
+ instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
regNumber op1Reg = REG_NA;
regNumber op2Reg = REG_NA;
@@ -408,28 +436,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
switch (intrinsicID)
{
- case NI_SSE_Add:
- case NI_SSE_AddScalar:
- case NI_SSE_And:
- case NI_SSE_AndNot:
case NI_SSE_ConvertToVector128SingleScalar:
- case NI_SSE_Divide:
- case NI_SSE_DivideScalar:
- case NI_SSE_Max:
- case NI_SSE_MaxScalar:
- case NI_SSE_Min:
- case NI_SSE_MinScalar:
- case NI_SSE_MoveHighToLow:
- case NI_SSE_MoveLowToHigh:
- case NI_SSE_MoveScalar:
- case NI_SSE_Multiply:
- case NI_SSE_MultiplyScalar:
- case NI_SSE_Or:
- case NI_SSE_Subtract:
- case NI_SSE_SubtractScalar:
- case NI_SSE_UnpackHigh:
- case NI_SSE_UnpackLow:
- case NI_SSE_Xor:
{
assert(node->TypeGet() == TYP_SIMD16);
assert(node->gtSIMDBaseType == TYP_FLOAT);
@@ -439,51 +446,14 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
genHWIntrinsic_R_R_RM(node, ins);
break;
}
-
- case NI_SSE_CompareEqual:
- case NI_SSE_CompareEqualScalar:
- case NI_SSE_CompareGreaterThan:
- case NI_SSE_CompareGreaterThanScalar:
- case NI_SSE_CompareGreaterThanOrEqual:
- case NI_SSE_CompareGreaterThanOrEqualScalar:
- case NI_SSE_CompareLessThan:
- case NI_SSE_CompareLessThanScalar:
- case NI_SSE_CompareLessThanOrEqual:
- case NI_SSE_CompareLessThanOrEqualScalar:
- case NI_SSE_CompareNotEqual:
- case NI_SSE_CompareNotEqualScalar:
- case NI_SSE_CompareNotGreaterThan:
- case NI_SSE_CompareNotGreaterThanScalar:
- case NI_SSE_CompareNotGreaterThanOrEqual:
- case NI_SSE_CompareNotGreaterThanOrEqualScalar:
- case NI_SSE_CompareNotLessThan:
- case NI_SSE_CompareNotLessThanScalar:
- case NI_SSE_CompareNotLessThanOrEqual:
- case NI_SSE_CompareNotLessThanOrEqualScalar:
- case NI_SSE_CompareOrdered:
- case NI_SSE_CompareOrderedScalar:
- case NI_SSE_CompareUnordered:
- case NI_SSE_CompareUnorderedScalar:
- {
- assert(node->TypeGet() == TYP_SIMD16);
- assert(node->gtSIMDBaseType == TYP_FLOAT);
- assert(Compiler::ivalOfHWIntrinsic(intrinsicID) != -1);
-
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
- genHWIntrinsic_R_R_RM_I(node, ins);
- break;
- }
-
case NI_SSE_CompareEqualOrderedScalar:
case NI_SSE_CompareEqualUnorderedScalar:
{
assert(baseType == TYP_FLOAT);
- op2Reg = op2->gtRegNum;
-
- regNumber tmpReg = node->GetSingleTempReg();
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
+ op2Reg = op2->gtRegNum;
+ regNumber tmpReg = node->GetSingleTempReg();
- emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16);
+ emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
@@ -498,8 +468,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
assert(baseType == TYP_FLOAT);
op2Reg = op2->gtRegNum;
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
- emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16);
+ emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
break;
@@ -511,8 +480,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
assert(baseType == TYP_FLOAT);
op2Reg = op2->gtRegNum;
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
- emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16);
+ emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
break;
@@ -524,8 +492,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
assert(baseType == TYP_FLOAT);
op2Reg = op2->gtRegNum;
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
- emit->emitIns_SIMD_R_R(ins, op2Reg, op1Reg, TYP_SIMD16);
+ emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
break;
@@ -537,8 +504,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
assert(baseType == TYP_FLOAT);
op2Reg = op2->gtRegNum;
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
- emit->emitIns_SIMD_R_R(ins, op2Reg, op1Reg, TYP_SIMD16);
+ emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
break;
@@ -550,10 +516,9 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
assert(baseType == TYP_FLOAT);
op2Reg = op2->gtRegNum;
- regNumber tmpReg = node->GetSingleTempReg();
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
+ regNumber tmpReg = node->GetSingleTempReg();
- emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16);
+ emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
@@ -562,84 +527,23 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
break;
}
- case NI_SSE_ConvertToInt32:
- case NI_SSE_ConvertToInt32WithTruncation:
- case NI_SSE_ConvertToInt64:
- case NI_SSE_ConvertToInt64WithTruncation:
- case NI_SSE_Reciprocal:
- case NI_SSE_ReciprocalSqrt:
- case NI_SSE_Sqrt:
- {
- assert(baseType == TYP_FLOAT);
- assert(op2 == nullptr);
-
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
- emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_SIMD16);
- break;
- }
-
case NI_SSE_ConvertToSingle:
case NI_SSE_StaticCast:
{
assert(op2 == nullptr);
if (op1Reg != targetReg)
{
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
- emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_SIMD16);
+ emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
}
break;
}
- case NI_SSE_LoadAlignedVector128:
- case NI_SSE_LoadScalar:
- case NI_SSE_LoadVector128:
- {
- assert(baseType == TYP_FLOAT);
- assert(op2 == nullptr);
-
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
- emit->emitIns_R_AR(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, 0);
- break;
- }
-
- case NI_SSE_LoadHigh:
- case NI_SSE_LoadLow:
- {
- assert(baseType == TYP_FLOAT);
- op2Reg = op2->gtRegNum;
-
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
- emit->emitIns_SIMD_R_R_AR(ins, targetReg, op1Reg, op2Reg, TYP_SIMD16);
- break;
- }
-
case NI_SSE_MoveMask:
{
assert(baseType == TYP_FLOAT);
assert(op2 == nullptr);
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
- emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_INT);
- break;
- }
-
- case NI_SSE_ReciprocalScalar:
- case NI_SSE_ReciprocalSqrtScalar:
- case NI_SSE_SqrtScalar:
- {
- assert(baseType == TYP_FLOAT);
- assert(op2 == nullptr);
-
- instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
- emit->emitIns_SIMD_R_R_R(ins, targetReg, op1Reg, op1Reg, TYP_SIMD16);
- break;
- }
-
- case NI_SSE_SetAllVector128:
- {
- assert(baseType == TYP_FLOAT);
- assert(op2 == nullptr);
- emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op1Reg, 0, TYP_SIMD16);
+ emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
break;
}
@@ -651,12 +555,12 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
if (op1Reg == targetReg)
{
regNumber tmpReg = node->GetSingleTempReg();
- emit->emitIns_SIMD_R_R(INS_movaps, tmpReg, op1Reg, TYP_SIMD16);
+ emit->emitIns_R_R(INS_movaps, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg);
op1Reg = tmpReg;
}
- emit->emitIns_SIMD_R_R_R(INS_xorps, targetReg, targetReg, targetReg, TYP_SIMD16);
- emit->emitIns_SIMD_R_R_R(INS_movss, targetReg, targetReg, op1Reg, TYP_SIMD16);
+ emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
+ emit->emitIns_SIMD_R_R_R(INS_movss, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg);
break;
}
@@ -665,7 +569,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
assert(baseType == TYP_FLOAT);
assert(op1 == nullptr);
assert(op2 == nullptr);
- emit->emitIns_SIMD_R_R_R(INS_xorps, targetReg, targetReg, targetReg, TYP_SIMD16);
+ emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
break;
}
@@ -699,7 +603,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
if (op3->IsCnsIntOrI())
{
ssize_t ival = op3->AsIntConCommon()->IconValue();
- emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op2Reg, (int)ival, TYP_SIMD16);
+ emit->emitIns_SIMD_R_R_R_I(INS_shufps, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, (int)ival);
}
else
{
@@ -749,7 +653,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
for (unsigned i = 0; i < jmpCount; i++)
{
genDefineTempLabel(jmpTable[i]);
- emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op2Reg, i, TYP_SIMD16);
+ emit->emitIns_SIMD_R_R_R_I(INS_shufps, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, i);
emit->emitIns_J(INS_jmp, switchTableEnd);
}
diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h
index f9ccf7fd8c..ec9c9d70b0 100644
--- a/src/jit/hwintrinsiclistxarch.h
+++ b/src/jit/hwintrinsiclistxarch.h
@@ -15,7 +15,7 @@
1) Each hardware intrinsic has a unique Intrinsic ID with type of `enum NamedIntrinsic`
2) All the overloads of an intrinsic in an ISA class share one Intrinsic ID
3) The intrinsic that generates instructions with a fixed imm8 operand has a `ival` field with "not -1" value, e.g., Sse.CompareEqual(v1,v2) -> cmpps xmm0, xmm1, 0
- 4) SIMD intrinsics have a non-zero `SIMD size` field based-on that operate over `Vector128<T>` (16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_) or `Vector256<T>`
+ 4) SIMD intrinsics have a non-zero `SIMD size` field based-on that operate over `Vector128<T>`(16) or `Vector256<T>`(32)
5) Scalar intrinsics that operate over general purpose registers (e.g., Sse41.Crc32) have `SIMD size` with 0
6) Each intrinsic has a `NumArg` for number of parameters, and some intrinsics that are overloaded on multiple parameter numbers have this field with -1
7) Each intrinsic has 10 `instructions` fields that list the instructions should be generated based-on the base type
@@ -29,92 +29,86 @@
// SSE Intrinsics
HARDWARE_INTRINSIC(SSE_IsSupported, "get_IsSupported", SSE, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_Add, "Add", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(SSE_AddScalar, "AddScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_AddScalar, "AddScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative)
HARDWARE_INTRINSIC(SSE_And, "And", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(SSE_AndNot, "AndNot", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Fixed)
-HARDWARE_INTRINSIC(SSE_CompareEqual, "CompareEqual", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar, "CompareEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareEqualScalar, "CompareEqualScalar", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar, "CompareEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_AndNot, "AndNot", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareEqual, "CompareEqual", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar, "CompareEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareEqualScalar, "CompareEqualScalar", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar, "CompareEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
HARDWARE_INTRINSIC(SSE_CompareGreaterThan, "CompareGreaterThan", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar, "CompareGreaterThanScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar, "CompareGreaterThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar, "CompareGreaterThanScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar, "CompareGreaterThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqual, "CompareGreaterThanOrEqual", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar, "CompareGreaterThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar, "CompareGreaterThanOrEqualScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar, "CompareGreaterThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar, "CompareGreaterThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar, "CompareGreaterThanOrEqualScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar, "CompareGreaterThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
HARDWARE_INTRINSIC(SSE_CompareLessThan, "CompareLessThan", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar, "CompareLessThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanScalar, "CompareLessThanScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar, "CompareLessThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar, "CompareLessThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareLessThanScalar, "CompareLessThanScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar, "CompareLessThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqual, "CompareLessThanOrEqual", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar, "CompareLessThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar, "CompareLessThanOrEqualScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar, "CompareLessThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotEqual, "CompareNotEqual", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar, "CompareNotEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar, "CompareNotEqualScalar", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar, "CompareNotEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar, "CompareLessThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar, "CompareLessThanOrEqualScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar, "CompareLessThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareNotEqual, "CompareNotEqual", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar, "CompareNotEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar, "CompareNotEqualScalar", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar, "CompareNotEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment)
HARDWARE_INTRINSIC(SSE_CompareNotGreaterThan, "CompareNotGreaterThan", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar, "CompareNotGreaterThanScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar, "CompareNotGreaterThanScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqual, "CompareNotGreaterThanOrEqual", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar, "CompareNotGreaterThanOrEqualScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar, "CompareNotGreaterThanOrEqualScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_CompareNotLessThan, "CompareNotLessThan", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar, "CompareNotLessThanScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar, "CompareNotLessThanScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqual, "CompareNotLessThanOrEqual", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar, "CompareNotLessThanOrEqualScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar, "CompareNotLessThanOrEqualScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_CompareOrdered, "CompareOrdered", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareOrderedScalar, "CompareOrderedScalar", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareOrderedScalar, "CompareOrderedScalar", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_CompareUnordered, "CompareUnordered", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar, "CompareUnorderedScalar", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToInt32, "ConvertToInt32", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToInt64, "ConvertToInt64", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToSingle, "ConvertToSingle", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar, "ConvertToVector128SingleScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation, "ConvertToInt64WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_LoadScalar, "LoadScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Max, "Max", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Min, "Min", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MinScalar, "MinScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MoveHighToLow, "MoveHighToLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MoveLowToHigh, "MoveLowToHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MoveMask, "MoveMask", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MoveScalar, "MoveScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Multiply, "Multiply", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_MultiplyScalar, "MultiplyScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar, "CompareUnorderedScalar", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToInt32, "ConvertToInt32", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_ConvertToInt64, "ConvertToInt64", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_ConvertToSingle, "ConvertToSingle", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_Helper, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar, "ConvertToVector128SingleScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss, INS_invalid}, HW_Category_Special, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation, "ConvertToInt64WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadScalar, "LoadScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Max, "Max", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_Min, "Min", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_MinScalar, "MinScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_MoveHighToLow, "MoveHighToLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_MoveLowToHigh, "MoveLowToHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_MoveMask, "MoveMask", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_MoveScalar, "MoveScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE_Multiply, "Multiply", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE_MultiplyScalar, "MultiplyScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative)
HARDWARE_INTRINSIC(SSE_Or, "Or", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_Reciprocal, "Reciprocal", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ReciprocalScalar, "ReciprocalScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ReciprocalScalar, "ReciprocalScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE_ReciprocalSqrt, "ReciprocalSqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalSqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SetAllVector128, "SetAllVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SetScalar, "SetScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SetVector128, "SetVector128", SSE, -1, 16, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SetZeroVector128, "SetZeroVector128", SSE, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalSqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE_SetAllVector128, "SetAllVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(SSE_SetScalar, "SetScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_Helper, HW_Flag_MultiIns)
+HARDWARE_INTRINSIC(SSE_SetVector128, "SetVector128", SSE, -1, 16, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(SSE_SetZeroVector128, "SetZeroVector128", SSE, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_Helper, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_IMM, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_Sqrt, "Sqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_StaticCast, "StaticCast", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_Store, "Store", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_StoreAligned, "StoreAligned", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_StoreHigh, "StoreHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_StoreLow, "StoreLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-//HARDWARE_INTRINSIC(SSE_StoreScalar, "StoreScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE_StaticCast, "StaticCast", SSE, -1, 16, 1, {INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps}, HW_Category_Helper, HW_Flag_TwoTypeGeneric)
HARDWARE_INTRINSIC(SSE_Subtract, "Subtract", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_UnpackHigh, "UnpackHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_UnpackLow, "UnpackLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE_Xor, "Xor", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE_Xor, "Xor", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
// SSE2 Intrinsics
HARDWARE_INTRINSIC(SSE2_IsSupported, "get_IsSupported", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp
index a96952db6c..80091f189f 100644
--- a/src/jit/hwintrinsicxarch.cpp
+++ b/src/jit/hwintrinsicxarch.cpp
@@ -16,7 +16,7 @@ struct HWIntrinsicInfo
int numArgs;
instruction ins[10];
HWIntrinsicCategory category;
- HWIntrinsicFlag flag;
+ HWIntrinsicFlag flags;
};
static const HWIntrinsicInfo hwIntrinsicInfoArray[] = {
@@ -192,7 +192,7 @@ int Compiler::ivalOfHWIntrinsic(NamedIntrinsic intrinsic)
static unsigned simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig)
{
assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
- assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag & HW_Flag_UnfixedSIMDSize) == 0);
+ assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flags & HW_Flag_UnfixedSIMDSize) == 0);
return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize;
}
@@ -248,19 +248,19 @@ HWIntrinsicCategory Compiler::categoryOfHWIntrinsic(NamedIntrinsic intrinsic)
}
//------------------------------------------------------------------------
-// HWIntrinsicFlag: get the flag of the given intrinsic
+// HWIntrinsicFlag: get the flags of the given intrinsic
//
// Arguments:
// intrinsic -- id of the intrinsic function.
//
// Return Value:
-// the flag of the given intrinsic
+// the flags of the given intrinsic
//
-HWIntrinsicFlag Compiler::flagOfHWIntrinsic(NamedIntrinsic intrinsic)
+HWIntrinsicFlag Compiler::flagsOfHWIntrinsic(NamedIntrinsic intrinsic)
{
assert(intrinsic != NI_Illegal);
assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END);
- return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag;
+ return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flags;
}
//------------------------------------------------------------------------
@@ -290,7 +290,7 @@ GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE
assert(varTypeIsArithmetic(argType));
arg = impPopStack().val;
assert(varTypeIsArithmetic(arg->TypeGet()));
- assert(genTypeSize(argType) <= genTypeSize(arg->TypeGet()));
+ assert(genActualType(arg->gtType) == genActualType(argType));
}
return arg;
}
@@ -436,12 +436,10 @@ GenTree* Compiler::impUnsupportedHWIntrinsic(unsigned helper,
// Return Value:
// returns true if this category can be table-driven in the importer
//
-static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category)
+static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags)
{
- // TODO - make more categories to the table-driven framework
- const bool tableDrivenIntrinsic = category == HW_Category_SimpleSIMD;
- const bool nonTableDrivenIntrinsic = category == HW_Category_Special;
- return tableDrivenIntrinsic && !nonTableDrivenIntrinsic;
+ // HW_Flag_NoCodeGen implies this intrinsic should be manually morphed in the importer.
+ return category != HW_Category_Special && category != HW_Category_Scalar && (flags & HW_Flag_NoCodeGen) == 0;
}
//------------------------------------------------------------------------
@@ -462,15 +460,24 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic,
{
InstructionSet isa = isaOfHWIntrinsic(intrinsic);
HWIntrinsicCategory category = categoryOfHWIntrinsic(intrinsic);
+ HWIntrinsicFlag flags = flagsOfHWIntrinsic(intrinsic);
int numArgs = sig->numArgs;
- var_types callType = JITtype2varType(sig->retType);
+ var_types retType = JITtype2varType(sig->retType);
+ var_types baseType = TYP_UNKNOWN;
+ if (retType == TYP_STRUCT && featureSIMD)
+ {
+ unsigned int sizeBytes;
+ baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes);
+ retType = getSIMDTypeForSize(sizeBytes);
+ assert(sizeBytes != 0 && baseType != TYP_UNKNOWN);
+ }
// This intrinsic is supported if
// - the ISA is available on the underlying hardware (compSupports returns true)
// - the compiler supports this hardware intrinsics (compSupportsHWIntrinsic returns true)
// - intrinsics do not require 64-bit registers (r64) on 32-bit platforms (isTypeSupportedForIntrinsic returns
// true)
- bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa) && isTypeSupportedForIntrinsic(callType);
+ bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa) && isTypeSupportedForIntrinsic(retType);
if (category == HW_Category_IsSupportedProperty)
{
@@ -481,22 +488,59 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic,
{
return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
}
+ else if (category == HW_Category_IMM)
+ {
+ GenTree* lastOp = impStackTop().val;
+ if (!lastOp->IsCnsIntOrI() && !mustExpand)
+ {
+ // When the imm-argument is not a constant and we are not being forced to expand, we need to
+ // return nullptr so a GT_CALL to the intrinsic method is emitted instead. The
+ // intrinsic method is recursive and will be forced to expand, at which point
+ // we emit some less efficient fallback code.
+ return nullptr;
+ }
+ }
+
+ if ((flags & HW_Flag_Generic) != 0)
+ {
+ assert(baseType != TYP_UNKNOWN);
+ // When the type argument is not a numeric type (and we are not being forced to expand), we need to
+ // return nullptr so a GT_CALL to the intrinsic method is emitted that will throw NotSupportedException
+ if (!varTypeIsArithmetic(baseType))
+ {
+ assert(!mustExpand);
+ return nullptr;
+ }
+
+ if ((flags & HW_Flag_TwoTypeGeneric) != 0)
+ {
+ // StaticCast<T, U> has two type parameters.
+ assert(!mustExpand);
+ assert(numArgs == 1);
+ var_types srcType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args));
+ assert(srcType != TYP_UNKNOWN);
+ if (!varTypeIsArithmetic(srcType))
+ {
+ return nullptr;
+ }
+ }
+ }
// table-driven importer of simple intrinsics
- if (impIsTableDrivenHWIntrinsic(category))
+ if (impIsTableDrivenHWIntrinsic(category, flags))
{
- unsigned int sizeBytes;
- var_types baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes);
- assert(baseType != TYP_UNKNOWN && sizeBytes != 0);
- var_types retType = getSIMDTypeForSize(sizeBytes);
+ if (!varTypeIsSIMD(retType))
+ {
+ baseType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args));
+ assert(baseType != TYP_UNKNOWN);
+ }
+
unsigned simdSize = simdSizeOfHWIntrinsic(intrinsic, sig);
CORINFO_ARG_LIST_HANDLE argList = sig->args;
CORINFO_CLASS_HANDLE argClass;
var_types argType = TYP_UNKNOWN;
- assert(numArgs > 0);
- assert(retType != TYP_UNDEF);
- assert(retType == TYP_SIMD16 || retType == TYP_SIMD32);
+ assert(numArgs >= 0);
assert(insOfHWIntrinsic(intrinsic, baseType) != INS_invalid);
assert(simdSize == 32 || simdSize == 16);
@@ -506,10 +550,12 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic,
switch (numArgs)
{
+ case 0:
+ retNode = gtNewSimdHWIntrinsicNode(retType, intrinsic, baseType, simdSize);
+ break;
case 1:
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
op1 = getArgForHWIntrinsic(argType, argClass);
-
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
break;
case 2:
@@ -537,8 +583,7 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic,
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
op1 = getArgForHWIntrinsic(argType, argClass);
- op1 = gtNewArgList(op1, op2, op3);
- retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
+ retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, intrinsic, baseType, simdSize);
break;
}
default:
@@ -653,11 +698,13 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic,
CORINFO_SIG_INFO* sig,
bool mustExpand)
{
- GenTree* retNode = nullptr;
- GenTree* op1 = nullptr;
- GenTree* op2 = nullptr;
- GenTree* op3 = nullptr;
- GenTree* op4 = nullptr;
+ GenTree* retNode = nullptr;
+ GenTree* op1 = nullptr;
+ GenTree* op2 = nullptr;
+ GenTree* op3 = nullptr;
+ GenTree* op4 = nullptr;
+ int simdSize = simdSizeOfHWIntrinsic(intrinsic, sig);
+ assert(simdSize == 16);
switch (intrinsic)
{
@@ -671,112 +718,14 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic,
op2 = impPopStack().val;
op1 = impPopStack().val;
- GenTree* left = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op4, op3, NI_SSE_UnpackLow, TYP_FLOAT, 16);
- GenTree* right = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, op1, NI_SSE_UnpackLow, TYP_FLOAT, 16);
+ GenTree* left = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op4, op3, NI_SSE_UnpackLow, TYP_FLOAT, simdSize);
+ GenTree* right = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, op1, NI_SSE_UnpackLow, TYP_FLOAT, simdSize);
GenTree* control = gtNewIconNode(68, TYP_UBYTE);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, left, right, control, NI_SSE_Shuffle, TYP_FLOAT, 16);
+ retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, left, right, control, NI_SSE_Shuffle, TYP_FLOAT, simdSize);
break;
}
- case NI_SSE_Shuffle:
- {
- assert(sig->numArgs == 3);
- assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
-
- op3 = impStackTop().val;
-
- if (op3->IsCnsIntOrI() || mustExpand)
- {
- impPopStack(); // Pop the value we peeked at
- op2 = impSIMDPopStack(TYP_SIMD16);
- op1 = impSIMDPopStack(TYP_SIMD16);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, op3, intrinsic, TYP_FLOAT, 16);
- }
- else
- {
- // When op3 is not a constant and we are not being forced to expand, we need to
- // return nullptr so a GT_CALL to the intrinsic method is emitted instead. The
- // intrinsic method is recursive and will be forced to expand, at which point
- // we emit some less efficient fallback code.
-
- return nullptr;
- }
- break;
- }
-
- case NI_SSE_Add:
- case NI_SSE_AddScalar:
- case NI_SSE_And:
- case NI_SSE_AndNot:
- case NI_SSE_CompareEqual:
- case NI_SSE_CompareEqualScalar:
- case NI_SSE_CompareGreaterThan:
- case NI_SSE_CompareGreaterThanScalar:
- case NI_SSE_CompareGreaterThanOrEqual:
- case NI_SSE_CompareGreaterThanOrEqualScalar:
- case NI_SSE_CompareLessThan:
- case NI_SSE_CompareLessThanScalar:
- case NI_SSE_CompareLessThanOrEqual:
- case NI_SSE_CompareLessThanOrEqualScalar:
- case NI_SSE_CompareNotEqual:
- case NI_SSE_CompareNotEqualScalar:
- case NI_SSE_CompareNotGreaterThan:
- case NI_SSE_CompareNotGreaterThanScalar:
- case NI_SSE_CompareNotGreaterThanOrEqual:
- case NI_SSE_CompareNotGreaterThanOrEqualScalar:
- case NI_SSE_CompareNotLessThan:
- case NI_SSE_CompareNotLessThanScalar:
- case NI_SSE_CompareNotLessThanOrEqual:
- case NI_SSE_CompareNotLessThanOrEqualScalar:
- case NI_SSE_CompareOrdered:
- case NI_SSE_CompareOrderedScalar:
- case NI_SSE_CompareUnordered:
- case NI_SSE_CompareUnorderedScalar:
- case NI_SSE_Divide:
- case NI_SSE_DivideScalar:
- case NI_SSE_Max:
- case NI_SSE_MaxScalar:
- case NI_SSE_Min:
- case NI_SSE_MinScalar:
- case NI_SSE_MoveHighToLow:
- case NI_SSE_MoveLowToHigh:
- case NI_SSE_MoveScalar:
- case NI_SSE_Multiply:
- case NI_SSE_MultiplyScalar:
- case NI_SSE_Or:
- case NI_SSE_Subtract:
- case NI_SSE_SubtractScalar:
- case NI_SSE_UnpackHigh:
- case NI_SSE_UnpackLow:
- case NI_SSE_Xor:
- assert(sig->numArgs == 2);
- assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
- op2 = impSIMDPopStack(TYP_SIMD16);
- op1 = impSIMDPopStack(TYP_SIMD16);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16);
- break;
-
- case NI_SSE_CompareEqualOrderedScalar:
- case NI_SSE_CompareEqualUnorderedScalar:
- case NI_SSE_CompareGreaterThanOrderedScalar:
- case NI_SSE_CompareGreaterThanUnorderedScalar:
- case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
- case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
- case NI_SSE_CompareLessThanOrderedScalar:
- case NI_SSE_CompareLessThanUnorderedScalar:
- case NI_SSE_CompareLessThanOrEqualOrderedScalar:
- case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
- case NI_SSE_CompareNotEqualOrderedScalar:
- case NI_SSE_CompareNotEqualUnorderedScalar:
- assert(sig->numArgs == 2);
- assert(JITtype2varType(sig->retType) == TYP_BOOL);
- assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT);
- op2 = impSIMDPopStack(TYP_SIMD16);
- op1 = impSIMDPopStack(TYP_SIMD16);
- retNode = gtNewSimdHWIntrinsicNode(TYP_BOOL, op1, op2, intrinsic, TYP_FLOAT, 16);
- break;
-
case NI_SSE_ConvertToVector128SingleScalar:
{
assert(sig->numArgs == 2);
@@ -797,18 +746,7 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic,
op2 = impPopStack().val;
op1 = impSIMDPopStack(TYP_SIMD16);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16);
- break;
- }
-
- case NI_SSE_LoadHigh:
- case NI_SSE_LoadLow:
- {
- assert(sig->numArgs == 2);
- assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
- op2 = impPopStack().val;
- op1 = impSIMDPopStack(TYP_SIMD16);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16);
+ retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, simdSize);
break;
}
@@ -817,77 +755,15 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic,
assert(JITtype2varType(sig->retType) == TYP_INT);
assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT);
op1 = impSIMDPopStack(TYP_SIMD16);
- retNode = gtNewSimdHWIntrinsicNode(TYP_INT, op1, intrinsic, TYP_FLOAT, 16);
- break;
-
- case NI_SSE_StaticCast:
- {
- assert(sig->numArgs == 1);
- var_types tgtType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
- var_types srcType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args));
-
- if (varTypeIsArithmetic(tgtType) && varTypeIsArithmetic(srcType))
- {
- op1 = impSIMDPopStack(TYP_SIMD16);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, tgtType, 16);
- }
- else
- {
- return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
- }
+ retNode = gtNewSimdHWIntrinsicNode(TYP_INT, op1, intrinsic, TYP_FLOAT, simdSize);
break;
- }
- case NI_SSE_LoadAlignedVector128:
- case NI_SSE_LoadScalar:
- case NI_SSE_LoadVector128:
case NI_SSE_SetAllVector128:
- case NI_SSE_SetScalar:
assert(sig->numArgs == 1);
assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
op1 = impPopStack().val;
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, TYP_FLOAT, 16);
- break;
-
- case NI_SSE_Reciprocal:
- case NI_SSE_ReciprocalScalar:
- case NI_SSE_ReciprocalSqrt:
- case NI_SSE_ReciprocalSqrtScalar:
- case NI_SSE_Sqrt:
- case NI_SSE_SqrtScalar:
- assert(sig->numArgs == 1);
- assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
- op1 = impSIMDPopStack(TYP_SIMD16);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, TYP_FLOAT, 16);
- break;
-
- case NI_SSE_ConvertToInt32:
- case NI_SSE_ConvertToInt32WithTruncation:
- case NI_SSE_ConvertToInt64:
- case NI_SSE_ConvertToInt64WithTruncation:
- case NI_SSE_ConvertToSingle:
- {
- assert(sig->numArgs == 1);
- assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT);
- var_types callType = JITtype2varType(sig->retType);
-
-#ifdef _TARGET_X86_
- if (varTypeIsLong(callType))
- {
- assert(intrinsic == NI_SSE_ConvertToInt64 || intrinsic == NI_SSE_ConvertToInt64WithTruncation);
- return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
- }
-#endif // _TARGET_X86_
-
- op1 = impSIMDPopStack(TYP_SIMD16);
- retNode = gtNewSimdHWIntrinsicNode(callType, op1, intrinsic, TYP_FLOAT, 16);
- break;
- }
-
- case NI_SSE_SetZeroVector128:
- assert(sig->numArgs == 0);
- assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, intrinsic, TYP_FLOAT, 16);
+ retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtCloneExpr(op1), gtNewIconNode(0), NI_SSE_Shuffle,
+ TYP_FLOAT, simdSize);
break;
default:
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index f48a6ce6ef..1b8533248e 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -252,7 +252,6 @@ INST3( andnpd, "andnpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x55)) /
INST3( orps, "orps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x56)) // Or packed singles
INST3( orpd, "orpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x56)) // Or packed doubles
INST3( haddpd, "haddpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x7C)) // Horizontal add packed doubles
-INST3( rcpps, "rcpps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53)) // Reciprocals of Packed Singles
// SSE 2 approx arith
INST3( rcpps, "rcpps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53)) // Reciprocal of packed singles
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 4169b4a410..a046704c91 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -2305,14 +2305,17 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode)
void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
{
NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
- HWIntrinsicCategory category = comp->categoryOfHWIntrinsic(intrinsicID);
- int numArgs = comp->numArgsOfHWIntrinsic(intrinsicID);
+ HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID);
+ HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID);
+ int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID);
GenTree* op1 = node->gtGetOp1();
GenTree* op2 = node->gtGetOp2();
// TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
// TODO-XArch-CQ: Non-VEX encoded instructions require memory ops to be aligned
- if (category == HW_Category_SimpleSIMD && numArgs == 2 && comp->canUseVexEncoding())
+
+ if (comp->canUseVexEncoding() && numArgs == 2 && (flags & HW_Flag_NoContainment) == 0 &&
+ category == HW_Category_SimpleSIMD)
{
if (IsContainableMemoryOp(op2))
{
@@ -2325,7 +2328,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
}
}
- if (NamedIntrinsic == NI_SSE_Shuffle)
+ // TODO - change to all IMM intrinsics
+ if (intrinsicID == NI_SSE_Shuffle)
{
assert(op1->OperIsList());
GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current();
diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp
index f56a36a7d0..fbadd566c8 100644
--- a/src/jit/lsraxarch.cpp
+++ b/src/jit/lsraxarch.cpp
@@ -2564,7 +2564,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree,
info->internalIntCount = 2;
info->setInternalCandidates(this, allRegs(TYP_INT));
- break;
}
break;
}
@@ -2577,7 +2576,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree,
break;
case NI_SSE41_BlendVariable:
- {
if (!compiler->canUseVexEncoding())
{
// SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
@@ -2589,7 +2587,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree,
info->hasDelayFreeSrc = true;
}
break;
- }
#ifdef _TARGET_X86_
case NI_SSE42_Crc32:
diff --git a/src/jit/namedintrinsiclist.h b/src/jit/namedintrinsiclist.h
index 8d5aac28f3..6387f60cbe 100644
--- a/src/jit/namedintrinsiclist.h
+++ b/src/jit/namedintrinsiclist.h
@@ -39,18 +39,33 @@ enum HWIntrinsicFlag : unsigned int
// Generic
// - must throw NotSupportException if the type argument is not numeric type
HW_Flag_Generic = 0x4,
+ // Two-type Generic
+ // - the intrinsic has two type parameters
+ HW_Flag_TwoTypeGeneric = 0xC,
// NoCodeGen
// - should be transformed in the compiler front-end, cannot reach CodeGen
- HW_Flag_NoCodeGen = 0x8,
+ HW_Flag_NoCodeGen = 0x10,
// Unfixed SIMD-size
// - overloaded on multiple vector sizes (SIMD size in the table is unreliable)
- HW_Flag_UnfixedSIMDSize = 0x10,
+ HW_Flag_UnfixedSIMDSize = 0x20,
// Complex overload
// - the codegen of overloads cannot be determined by intrinsicID and base type
- HW_Flag_ComplexOverloads = 0x20,
+ HW_Flag_ComplexOverloads = 0x40,
+
+ // Multi-instruction
+ // - that one intrinsic can generate multiple instructions
+ HW_Flag_MultiIns = 0x80,
+
+ // NoContainment
+ // the intrinsic cannot be contained
+ HW_Flag_NoContainment = 0x100,
+
+ // Copy Upper bits
+ // some SIMD scalar intrinsics need the semantics of copying upper bits from the source operand
+ HW_Flag_CopyUpperBits = 0x200,
};
inline HWIntrinsicFlag operator|(HWIntrinsicFlag c1, HWIntrinsicFlag c2)
@@ -63,7 +78,6 @@ enum HWIntrinsicCategory : unsigned int
// Simple SIMD intrinsics
// - take Vector128/256<T> parameters
// - return a Vector128/256<T>
- // - generate single instruction
// - the codegen of overloads can be determined by intrinsicID and base type of returned vector
HW_Category_SimpleSIMD,
@@ -79,6 +93,10 @@ enum HWIntrinsicCategory : unsigned int
// - operate over general purpose registers, like crc32, lzcnt, popcnt, etc.
HW_Category_Scalar,
+ // SIMD scalar
+ // - operate over vector registers(XMM), but just compute on the first element
+ HW_Category_SIMDScalar,
+
// Memory access intrinsics
// - e.g., Avx.Load, Avx.Store, Sse.LoadAligned
HW_Category_MemoryLoad,