summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCarol Eidt <carol.eidt@microsoft.com>2019-03-26 16:13:40 -0700
committerGitHub <noreply@github.com>2019-03-26 16:13:40 -0700
commitda6ed1197abcb1be420351af1bb3758de6048c8f (patch)
treeec26f10d197a21d7f3c80478da6fb728abd8aa31
parentaa072b639fc2eb0e60a8083e4c74426db91341e0 (diff)
downloadcoreclr-da6ed1197abcb1be420351af1bb3758de6048c8f.tar.gz
coreclr-da6ed1197abcb1be420351af1bb3758de6048c8f.tar.bz2
coreclr-da6ed1197abcb1be420351af1bb3758de6048c8f.zip
Handle addressing modes for HW intrinsics (#22944)
* Handle addressing modes for HW intrinsics Also, eliminate some places where the code size estimates were over-estimating. Contribute to #19550 Fix #19521
-rw-r--r--src/jit/codegen.h5
-rw-r--r--src/jit/codegencommon.cpp11
-rw-r--r--src/jit/codegenlinear.cpp77
-rw-r--r--src/jit/emitxarch.cpp37
-rw-r--r--src/jit/emitxarch.h3
-rw-r--r--src/jit/gentree.cpp20
-rw-r--r--src/jit/hwintrinsiccodegenxarch.cpp265
-rw-r--r--src/jit/hwintrinsiclistxarch.h86
-rw-r--r--src/jit/hwintrinsicxarch.cpp9
-rw-r--r--src/jit/instr.cpp121
-rw-r--r--src/jit/lower.h1
-rw-r--r--src/jit/lowerxarch.cpp69
-rw-r--r--src/jit/lsra.h1
-rw-r--r--src/jit/lsrabuild.cpp11
-rw-r--r--src/jit/lsraxarch.cpp22
-rw-r--r--tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.cs143
-rw-r--r--tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.csproj16
17 files changed, 650 insertions, 247 deletions
diff --git a/src/jit/codegen.h b/src/jit/codegen.h
index 046addff4c..1631480853 100644
--- a/src/jit/codegen.h
+++ b/src/jit/codegen.h
@@ -95,6 +95,7 @@ private:
static bool genShouldRoundFP();
GenTreeIndir indirForm(var_types type, GenTree* base);
+ GenTreeStoreInd storeIndirForm(var_types type, GenTree* base, GenTree* data);
GenTreeIntCon intForm(var_types type, ssize_t value);
@@ -1040,6 +1041,9 @@ protected:
void genConsumeRegs(GenTree* tree);
void genConsumeOperands(GenTreeOp* tree);
+#ifdef FEATURE_HW_INTRINSICS
+ void genConsumeHWIntrinsicOperands(GenTreeHWIntrinsic* tree);
+#endif // FEATURE_HW_INTRINSICS
void genEmitGSCookieCheck(bool pushReg);
void genSetRegToIcon(regNumber reg, ssize_t val, var_types type = TYP_INT, insFlags flags = INS_FLAGS_DONT_CARE);
void genCodeForShift(GenTree* tree);
@@ -1309,6 +1313,7 @@ public:
#if defined(_TARGET_XARCH_)
void inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regNumber reg2, unsigned ival);
+ void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival);
#endif
void inst_RV_RR(instruction ins, emitAttr size, regNumber reg1, regNumber reg2);
diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp
index 9efa517edb..8f31debc9d 100644
--- a/src/jit/codegencommon.cpp
+++ b/src/jit/codegencommon.cpp
@@ -11317,6 +11317,17 @@ GenTreeIndir CodeGen::indirForm(var_types type, GenTree* base)
}
//------------------------------------------------------------------------
+// storeIndirForm: Make a temporary store indir we can feed to pattern matching routines
+// in cases where we don't want to instantiate all the indirs that happen.
+//
+GenTreeStoreInd CodeGen::storeIndirForm(var_types type, GenTree* base, GenTree* data)
+{
+ GenTreeStoreInd i(type, base, data);
+ i.gtRegNum = REG_NA;
+ return i;
+}
+
+//------------------------------------------------------------------------
// intForm: Make a temporary int we can feed to pattern matching routines
// in cases where we don't want to instantiate.
//
diff --git a/src/jit/codegenlinear.cpp b/src/jit/codegenlinear.cpp
index 7c5c01895e..72f9fa68db 100644
--- a/src/jit/codegenlinear.cpp
+++ b/src/jit/codegenlinear.cpp
@@ -1339,12 +1339,27 @@ void CodeGen::genConsumeRegs(GenTree* tree)
// Update the life of the lcl var.
genUpdateLife(tree);
}
-#endif // _TARGET_XARCH_
- else if (tree->OperIsInitVal())
+#ifdef FEATURE_HW_INTRINSICS
+ else if (tree->OperIs(GT_HWIntrinsic))
{
- genConsumeReg(tree->gtGetOp1());
+ // Only load/store HW intrinsics can be contained (and the address may also be contained).
+ HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(tree->AsHWIntrinsic()->gtHWIntrinsicId);
+ assert((category == HW_Category_MemoryLoad) || (category == HW_Category_MemoryStore));
+ int numArgs = HWIntrinsicInfo::lookupNumArgs(tree->AsHWIntrinsic());
+ genConsumeAddress(tree->gtGetOp1());
+ if (category == HW_Category_MemoryStore)
+ {
+ assert((numArgs == 2) && !tree->gtGetOp2()->isContained());
+ genConsumeReg(tree->gtGetOp2());
+ }
+ else
+ {
+ assert(numArgs == 1);
+ }
}
- else if (tree->OperIsHWIntrinsic())
+#endif // FEATURE_HW_INTRINSICS
+#endif // _TARGET_XARCH_
+ else if (tree->OperIsInitVal())
{
genConsumeReg(tree->gtGetOp1());
}
@@ -1374,11 +1389,6 @@ void CodeGen::genConsumeRegs(GenTree* tree)
// Return Value:
// None.
//
-// Notes:
-// Note that this logic is localized here because we must do the liveness update in
-// the correct execution order. This is important because we may have two operands
-// that involve the same lclVar, and if one is marked "lastUse" we must handle it
-// after the first.
void CodeGen::genConsumeOperands(GenTreeOp* tree)
{
@@ -1395,6 +1405,55 @@ void CodeGen::genConsumeOperands(GenTreeOp* tree)
}
}
+#ifdef FEATURE_HW_INTRINSICS
+//------------------------------------------------------------------------
+// genConsumeHWIntrinsicOperands: Do liveness update for the operands of a GT_HWIntrinsic node
+//
+// Arguments:
+// node - the GenTreeHWIntrinsic node whose operands will have their liveness updated.
+//
+// Return Value:
+// None.
+//
+
+void CodeGen::genConsumeHWIntrinsicOperands(GenTreeHWIntrinsic* node)
+{
+ int numArgs = HWIntrinsicInfo::lookupNumArgs(node);
+ GenTree* op1 = node->gtGetOp1();
+ if (op1 == nullptr)
+ {
+ assert((numArgs == 0) && (node->gtGetOp2() == nullptr));
+ return;
+ }
+ if (op1->OperIs(GT_LIST))
+ {
+ int foundArgs = 0;
+ assert(node->gtGetOp2() == nullptr);
+ for (GenTreeArgList* list = op1->AsArgList(); list != nullptr; list = list->Rest())
+ {
+ GenTree* operand = list->Current();
+ genConsumeRegs(operand);
+ foundArgs++;
+ }
+ assert(foundArgs == numArgs);
+ }
+ else
+ {
+ genConsumeRegs(op1);
+ GenTree* op2 = node->gtGetOp2();
+ if (op2 != nullptr)
+ {
+ genConsumeRegs(op2);
+ assert(numArgs == 2);
+ }
+ else
+ {
+ assert(numArgs == 1);
+ }
+ }
+}
+#endif // FEATURE_HW_INTRINSICS
+
#if FEATURE_PUT_STRUCT_ARG_STK
//------------------------------------------------------------------------
// genConsumePutStructArgStk: Do liveness update for the operands of a PutArgStk node.
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index 8bba1f688a..e1365500e6 100644
--- a/src/jit/emitxarch.cpp
+++ b/src/jit/emitxarch.cpp
@@ -2849,6 +2849,12 @@ void emitter::emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, G
id->idReg1(dstReg);
emitHandleMemOp(mem, id, IF_RWR_ARD, ins);
UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins));
+ if (Is4ByteSSEInstruction(ins))
+ {
+ // The 4-Byte SSE instructions require an additional byte.
+ sz += 1;
+ }
+
id->idCodeSize(sz);
dispIns(id);
emitCurIGsize += sz;
@@ -4037,6 +4043,12 @@ void emitter::emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTre
emitHandleMemOp(indir, id, IF_RRW_ARD, ins);
UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins));
+ if (Is4ByteSSEInstruction(ins))
+ {
+ // The 4-Byte SSE instructions require an additional byte.
+ sz += 1;
+ }
+
id->idCodeSize(sz);
dispIns(id);
@@ -4088,8 +4100,8 @@ void emitter::emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber reg1, reg
if (Is4ByteSSEInstruction(ins))
{
- // The 4-Byte SSE instructions require two additional bytes
- sz += 2;
+ // The 4-Byte SSE instructions require an additional byte.
+ sz += 1;
}
id->idCodeSize(sz);
@@ -5165,8 +5177,8 @@ void emitter::emitIns_R_AR(instruction ins, emitAttr attr, regNumber ireg, regNu
if (Is4ByteSSEInstruction(ins))
{
- // The 4-Byte SSE instructions require two additional bytes
- sz += 2;
+ // The 4-Byte SSE instructions require an additional byte.
+ sz += 1;
}
id->idCodeSize(sz);
@@ -5640,7 +5652,7 @@ void emitter::emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNu
#ifdef FEATURE_HW_INTRINSICS
//------------------------------------------------------------------------
-// emitIns_SIMD_R_R_I: emits the code for a SIMD instruction that takes a register operand, an immediate operand
+// emitIns_SIMD_R_R_I: emits the code for an instruction that takes a register operand, an immediate operand
// and that returns a value in register
//
// Arguments:
@@ -5650,6 +5662,13 @@ void emitter::emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNu
// op1Reg -- The register of the first operand
// ival -- The immediate value
//
+// Notes:
+// This will handle the required register copy if 'op1Reg' and 'targetReg' are not the same, and
+// the 3-operand format is not available.
+// This is not really SIMD-specific, but is currently only used in that context, as that's
+// where we frequently need to handle the case of generating 3-operand or 2-operand forms
+// depending on what target ISA is supported.
+//
void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival)
{
if (UseVEXEncoding() || IsDstSrcImmAvxInstruction(ins))
@@ -5704,12 +5723,14 @@ void emitter::emitIns_SIMD_R_R_A(
// targetReg -- The target register
// op1Reg -- The register of the first operand
// base -- The base register used for the memory address
+// offset -- The memory offset
//
-void emitter::emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base)
+void emitter::emitIns_SIMD_R_R_AR(
+ instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset)
{
if (UseVEXEncoding())
{
- emitIns_R_R_AR(ins, attr, targetReg, op1Reg, base, 0);
+ emitIns_R_R_AR(ins, attr, targetReg, op1Reg, base, offset);
}
else
{
@@ -5717,7 +5738,7 @@ void emitter::emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber targ
{
emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
}
- emitIns_R_AR(ins, attr, targetReg, base, 0);
+ emitIns_R_AR(ins, attr, targetReg, base, offset);
}
}
diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h
index 5b06838b07..bad81b7cbb 100644
--- a/src/jit/emitxarch.h
+++ b/src/jit/emitxarch.h
@@ -449,7 +449,8 @@ void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg,
void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival);
void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir);
-void emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base);
+void emitIns_SIMD_R_R_AR(
+ instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset);
void emitIns_SIMD_R_R_C(
instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs);
void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg);
diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp
index 623a98c22c..2b6520682c 100644
--- a/src/jit/gentree.cpp
+++ b/src/jit/gentree.cpp
@@ -3510,6 +3510,26 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
costSz = 2 * 2;
break;
+#if defined(FEATURE_HW_INTRINSICS) && defined(_TARGET_XARCH_)
+ case GT_HWIntrinsic:
+ {
+ if (tree->AsHWIntrinsic()->OperIsMemoryLoadOrStore())
+ {
+ costEx = IND_COST_EX;
+ costSz = 2;
+ // See if we can form a complex addressing mode.
+
+ GenTree* addr = op1->gtEffectiveVal();
+
+ if (addr->OperIs(GT_ADD) && gtMarkAddrMode(addr, &costEx, &costSz, tree->TypeGet()))
+ {
+ goto DONE;
+ }
+ }
+ }
+ break;
+#endif // FEATURE_HW_INTRINSICS && _TARGET_XARCH_
+
case GT_BLK:
case GT_IND:
diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp
index 9660ae346e..9eada1ef9d 100644
--- a/src/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/jit/hwintrinsiccodegenxarch.cpp
@@ -109,32 +109,50 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
case 1:
{
- genConsumeOperands(node);
- op1Reg = op1->gtRegNum;
-
if (node->OperIsMemoryLoad())
{
- emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
- }
- else if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
- {
- emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
- }
- else if ((ival != -1) && varTypeIsFloating(baseType))
- {
- assert((ival >= 0) && (ival <= 127));
- genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
+ genConsumeAddress(op1);
+ // Until we improve the handling of addressing modes in the emitter, we'll create a
+ // temporary GT_IND to generate code with.
+ GenTreeIndir load = indirForm(node->TypeGet(), op1);
+ emit->emitInsLoadInd(ins, simdSize, node->gtRegNum, &load);
}
else
{
- genHWIntrinsic_R_RM(node, ins, simdSize);
+ genConsumeRegs(op1);
+ op1Reg = op1->gtRegNum;
+
+ if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
+ {
+ emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
+ }
+ else if ((ival != -1) && varTypeIsFloating(baseType))
+ {
+ assert((ival >= 0) && (ival <= 127));
+ genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
+ }
+ else
+ {
+ genHWIntrinsic_R_RM(node, ins, simdSize);
+ }
}
break;
}
case 2:
{
- genConsumeOperands(node);
+ if (category == HW_Category_MemoryStore)
+ {
+ genConsumeAddress(op1);
+ genConsumeReg(op2);
+ // Until we improve the handling of addressing modes in the emitter, we'll create a
+ // temporary GT_STORE_IND to generate code with.
+ GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
+ emit->emitInsStoreInd(ins, simdSize, &store);
+ break;
+ }
+ genConsumeRegs(op1);
+ genConsumeRegs(op2);
op1Reg = op1->gtRegNum;
op2Reg = op2->gtRegNum;
@@ -153,25 +171,30 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
op1Reg = targetReg;
}
- if (category == HW_Category_MemoryStore)
- {
- emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0);
- }
- else if ((ival != -1) && varTypeIsFloating(baseType))
+ if ((ival != -1) && varTypeIsFloating(baseType))
{
assert((ival >= 0) && (ival <= 127));
genHWIntrinsic_R_R_RM_I(node, ins, ival);
}
else if (category == HW_Category_MemoryLoad)
{
+ // Get the address and the 'other' register.
+ GenTree* addr;
+ regNumber otherReg;
if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
{
- emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op2Reg, op1Reg);
+ addr = op1;
+ otherReg = op2Reg;
}
else
{
- emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
+ addr = op2;
+ otherReg = op1Reg;
}
+ // Until we improve the handling of addressing modes in the emitter, we'll create a
+ // temporary GT_IND to generate code with.
+ GenTreeIndir load = indirForm(node->TypeGet(), addr);
+ genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load);
}
else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
{
@@ -210,10 +233,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
case 3:
{
- assert(op1->OperIsList());
- assert(op1->gtGetOp2()->OperIsList());
- assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
-
GenTreeArgList* argList = op1->AsArgList();
op1 = argList->Current();
genConsumeRegs(op1);
@@ -520,99 +539,8 @@ void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, i
{
assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
-
- TempDsc* tmpDsc = nullptr;
- unsigned varNum = BAD_VAR_NUM;
- unsigned offset = (unsigned)-1;
-
- if (op1->isUsedFromSpillTemp())
- {
- assert(op1->IsRegOptional());
-
- tmpDsc = getSpillTempDsc(op1);
- varNum = tmpDsc->tdTempNum();
- offset = 0;
-
- regSet.tmpRlsTemp(tmpDsc);
- }
- else if (op1->OperIsHWIntrinsic())
- {
- emit->emitIns_R_AR_I(ins, simdSize, targetReg, op1->gtGetOp1()->gtRegNum, 0, ival);
- return;
- }
- else if (op1->isIndir())
- {
- GenTreeIndir* memIndir = op1->AsIndir();
- GenTree* memBase = memIndir->gtOp1;
-
- switch (memBase->OperGet())
- {
- case GT_LCL_VAR_ADDR:
- {
- varNum = memBase->AsLclVarCommon()->GetLclNum();
- offset = 0;
-
- // Ensure that all the GenTreeIndir values are set to their defaults.
- assert(!memIndir->HasIndex());
- assert(memIndir->Scale() == 1);
- assert(memIndir->Offset() == 0);
-
- break;
- }
-
- case GT_CLS_VAR_ADDR:
- {
- emit->emitIns_R_C_I(ins, simdSize, targetReg, memBase->gtClsVar.gtClsVarHnd, 0, ival);
- return;
- }
-
- default:
- {
- emit->emitIns_R_A_I(ins, simdSize, targetReg, memIndir, ival);
- return;
- }
- }
- }
- else
- {
- switch (op1->OperGet())
- {
- case GT_LCL_FLD:
- {
- GenTreeLclFld* lclField = op1->AsLclFld();
-
- varNum = lclField->GetLclNum();
- offset = lclField->gtLclFld.gtLclOffs;
- break;
- }
-
- case GT_LCL_VAR:
- {
- assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
- varNum = op1->AsLclVar()->GetLclNum();
- offset = 0;
- break;
- }
-
- default:
- unreached();
- break;
- }
- }
-
- // Ensure we got a good varNum and offset.
- // We also need to check for `tmpDsc != nullptr` since spill temp numbers
- // are negative and start with -1, which also happens to be BAD_VAR_NUM.
- assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
- assert(offset != (unsigned)-1);
-
- emit->emitIns_R_S_I(ins, simdSize, targetReg, varNum, offset, ival);
- }
- else
- {
- regNumber op1Reg = op1->gtRegNum;
- emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, ival);
}
+ inst_RV_TT_IV(ins, simdSize, targetReg, op1, ival);
}
//------------------------------------------------------------------------
@@ -681,7 +609,11 @@ void CodeGen::genHWIntrinsic_R_R_RM(
}
else if (op2->OperIsHWIntrinsic())
{
- emit->emitIns_SIMD_R_R_AR(ins, attr, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum);
+ GenTree* addr = op2->gtGetOp1();
+ // Until we improve the handling of addressing modes in the emitter, we'll create a
+ // temporary GT_IND to generate code with.
+ GenTreeIndir load = indirForm(node->TypeGet(), addr);
+ emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, &load);
return;
}
else if (op2->isIndir())
@@ -1267,7 +1199,7 @@ void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic,
// node - The hardware intrinsic node
//
// Note:
-// We currently assume that all base intrinsics only have a single operand.
+// We currently assume that all base intrinsics have zero or one operand.
//
void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
{
@@ -1279,15 +1211,10 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
assert(compiler->compSupports(InstructionSet_SSE));
assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));
- GenTree* op1 = node->gtGetOp1();
- regNumber op1Reg = REG_NA;
+ GenTree* op1 = node->gtGetOp1();
- if (op1 != nullptr)
- {
- assert(!op1->OperIsList());
- op1Reg = op1->gtRegNum;
- genConsumeOperands(node);
- }
+ genConsumeHWIntrinsicOperands(node);
+ regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->gtRegNum;
assert(node->gtGetOp2() == nullptr);
@@ -1418,11 +1345,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
regNumber op4Reg = REG_NA;
emitter* emit = getEmitter();
- if ((op1 != nullptr) && !op1->OperIsList())
- {
- op1Reg = op1->gtRegNum;
- genConsumeOperands(node);
- }
+ genConsumeHWIntrinsicOperands(node);
switch (intrinsicId)
{
@@ -1529,6 +1452,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
assert(op2 == nullptr);
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
+ op1Reg = op1->gtRegNum;
emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
break;
}
@@ -1568,11 +1492,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
regNumber op2Reg = REG_NA;
emitter* emit = getEmitter();
- if ((op1 != nullptr) && !op1->OperIsList())
- {
- op1Reg = op1->gtRegNum;
- genConsumeOperands(node);
- }
+ genConsumeHWIntrinsicOperands(node);
switch (intrinsicId)
{
@@ -1588,6 +1508,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
assert((ival >= 0) && (ival <= 127));
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+ op1Reg = op1->gtRegNum;
op2Reg = op2->gtRegNum;
emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
@@ -1711,6 +1632,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
if (varTypeIsIntegral(baseType))
{
assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
+ op1Reg = op1->gtRegNum;
emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
}
else
@@ -1748,6 +1670,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
op2Reg = op2->gtRegNum;
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+ op1Reg = op1->gtRegNum;
emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
break;
}
@@ -1783,16 +1706,13 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
regNumber op4Reg = REG_NA;
emitter* emit = getEmitter();
- if ((op1 != nullptr) && !op1->OperIsList())
- {
- op1Reg = op1->gtRegNum;
- genConsumeOperands(node);
- }
+ genConsumeHWIntrinsicOperands(node);
switch (intrinsicId)
{
case NI_SSE41_TestAllOnes:
{
+ op1Reg = op1->gtRegNum;
regNumber tmpReg = node->GetSingleTempReg();
assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
@@ -1845,12 +1765,12 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
if (baseType == TYP_FLOAT)
{
// extract instructions return to GP-registers, so it needs int size as the emitsize
- emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1Reg, i);
+ inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i);
emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
}
else
{
- emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), targetReg, op1Reg, i);
+ inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i);
}
};
@@ -1896,8 +1816,8 @@ void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
var_types targetType = node->TypeGet();
emitter* emit = getEmitter();
+ genConsumeHWIntrinsicOperands(node);
regNumber op1Reg = op1->gtRegNum;
- genConsumeOperands(node);
assert(targetReg != REG_NA);
assert(op1Reg != REG_NA);
@@ -1966,18 +1886,15 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
regNumber targetReg = node->gtRegNum;
emitter* emit = getEmitter();
- if ((op1 != nullptr) && !op1->OperIsList())
- {
- op1Reg = op1->gtRegNum;
- genConsumeOperands(node);
- }
+ genConsumeHWIntrinsicOperands(node);
switch (intrinsicId)
{
case NI_AVX2_ConvertToInt32:
case NI_AVX2_ConvertToUInt32:
{
- assert(op2 == nullptr);
+ op1Reg = op1->gtRegNum;
+ assert(numArgs == 1);
assert((baseType == TYP_INT) || (baseType == TYP_UINT));
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
@@ -1992,16 +1909,13 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
GenTreeArgList* list = op1->AsArgList();
op1 = list->Current();
op1Reg = op1->gtRegNum;
- genConsumeRegs(op1);
list = list->Rest();
op2 = list->Current();
op2Reg = op2->gtRegNum;
- genConsumeRegs(op2);
list = list->Rest();
GenTree* op3 = list->Current();
- genConsumeRegs(op3);
list = list->Rest();
GenTree* op4 = nullptr;
@@ -2017,12 +1931,11 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
if (numArgs == 5)
{
assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
- op4 = list->Current();
- list = list->Rest();
- lastOp = list->Current();
- op3Reg = op3->gtRegNum;
- op4Reg = op4->gtRegNum;
- genConsumeRegs(op4);
+ op4 = list->Current();
+ list = list->Rest();
+ lastOp = list->Current();
+ op3Reg = op3->gtRegNum;
+ op4Reg = op4->gtRegNum;
addrBaseReg = op2Reg;
addrIndexReg = op3Reg;
indexOp = op3;
@@ -2157,10 +2070,7 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
assert(targetReg != REG_NA);
assert(op1 != nullptr);
- if (!op1->OperIsList())
- {
- genConsumeOperands(node);
- }
+ genConsumeHWIntrinsicOperands(node);
switch (intrinsicId)
{
@@ -2224,16 +2134,13 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
{
GenTreeArgList* argList = op1->AsArgList();
op1 = argList->Current();
- genConsumeRegs(op1);
- op1Reg = op1->gtRegNum;
- argList = argList->Rest();
- op2 = argList->Current();
- genConsumeRegs(op2);
- op2Reg = op2->gtRegNum;
- argList = argList->Rest();
- GenTree* op3 = argList->Current();
- genConsumeRegs(op3);
- op3Reg = op3->gtRegNum;
+ op1Reg = op1->gtRegNum;
+ argList = argList->Rest();
+ op2 = argList->Current();
+ op2Reg = op2->gtRegNum;
+ argList = argList->Rest();
+ GenTree* op3 = argList->Current();
+ op3Reg = op3->gtRegNum;
assert(op3Reg != op1Reg);
assert(op3Reg != targetReg);
assert(op3Reg != REG_EDX);
@@ -2288,22 +2195,16 @@ void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
regNumber targetReg = node->gtRegNum;
assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);
- assert(op1 != nullptr);
- assert(op1->OperIsList());
- assert(op1->gtGetOp2()->OperIsList());
- assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
+ genConsumeHWIntrinsicOperands(node);
GenTreeArgList* argList = op1->AsArgList();
op1 = argList->Current();
- genConsumeRegs(op1);
argList = argList->Rest();
GenTree* op2 = argList->Current();
- genConsumeRegs(op2);
argList = argList->Rest();
GenTree* op3 = argList->Current();
- genConsumeRegs(op3);
regNumber op1Reg;
regNumber op2Reg;
diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h
index 60e0f801e2..6434831fdf 100644
--- a/src/jit/hwintrinsiclistxarch.h
+++ b/src/jit/hwintrinsiclistxarch.h
@@ -114,11 +114,11 @@ HARDWARE_INTRINSIC(SSE_ConvertScalarToVector128Single, "ConvertScal
HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_LoadScalarVector128, "LoadScalarVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_LoadScalarVector128, "LoadScalarVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE_Max, "Max", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE_Min, "Min", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
@@ -141,13 +141,13 @@ HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalS
HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(SSE_Sqrt, "Sqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE, -1, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE_Store, "Store", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_StoreAligned, "StoreAligned", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_Store, "Store", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_StoreAligned, "StoreAligned", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE_StoreFence, "StoreFence", SSE, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_StoreHigh, "StoreHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_StoreLow, "StoreLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_StoreScalar, "StoreScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_StoreHigh, "StoreHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_StoreLow, "StoreLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_StoreScalar, "StoreScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE_Subtract, "Subtract", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE_UnpackHigh, "UnpackHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
@@ -227,12 +227,12 @@ HARDWARE_INTRINSIC(SSE2_Divide, "Divide",
HARDWARE_INTRINSIC(SSE2_DivideScalar, "DivideScalar", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE2_Extract, "Extract", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_pextrw, INS_pextrw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE2_Insert, "Insert", SSE2, -1, 16, 3, {INS_invalid, INS_invalid, INS_pinsrw, INS_pinsrw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(SSE2_LoadAlignedVector128, "LoadAlignedVector128", SSE2, -1, 16, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_invalid, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_LoadAlignedVector128, "LoadAlignedVector128", SSE2, -1, 16, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_invalid, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE2_LoadFence, "LoadFence", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_LoadHigh, "LoadHigh", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhpd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_LoadLow, "LoadLow", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlpd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_LoadScalarVector128, "LoadScalarVector128", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_movd, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_LoadVector128, "LoadVector128", SSE2, -1, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_LoadHigh, "LoadHigh", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhpd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_LoadLow, "LoadLow", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlpd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_LoadScalarVector128, "LoadScalarVector128", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_movd, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_LoadVector128, "LoadVector128", SSE2, -1, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE2_MaskMove, "MaskMove", SSE2, -1, 16, 3, {INS_maskmovdqu, INS_maskmovdqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE2_Max, "Max", SSE2, -1, 16, 2, {INS_invalid, INS_pmaxub, INS_pmaxsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(SSE2_MemoryFence, "MemoryFence", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
@@ -260,13 +260,13 @@ HARDWARE_INTRINSIC(SSE2_ShuffleHigh, "ShuffleHigh
HARDWARE_INTRINSIC(SSE2_ShuffleLow, "ShuffleLow", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_pshuflw, INS_pshuflw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(SSE2_Sqrt, "Sqrt", SSE2, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE2_SqrtScalar, "SqrtScalar", SSE2, -1, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2_Store, "Store", SSE2, -1, 16, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_StoreAligned, "StoreAligned", SSE2, -1, 16, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_invalid, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE2, -1, 16, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_invalid, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_StoreHigh, "StoreHigh", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_StoreLow, "StoreLow", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_invalid, INS_movlpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_StoreNonTemporal, "StoreNonTemporal", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti, INS_movnti, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
-HARDWARE_INTRINSIC(SSE2_StoreScalar, "StoreScalar", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsdsse2}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_Store, "Store", SSE2, -1, 16, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_invalid, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_StoreAligned, "StoreAligned", SSE2, -1, 16, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_invalid, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE2, -1, 16, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_invalid, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_StoreHigh, "StoreHigh", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_StoreLow, "StoreLow", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_invalid, INS_movlpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_StoreNonTemporal, "StoreNonTemporal", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti, INS_movnti, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(SSE2_StoreScalar, "StoreScalar", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsdsse2}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE2_Subtract, "Subtract", SSE2, -1, 16, 2, {INS_psubb, INS_psubb, INS_psubw, INS_psubw, INS_psubd, INS_psubd, INS_psubq, INS_psubq, INS_invalid, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE2_SubtractSaturate, "SubtractSaturate", SSE2, -1, 16, 2, {INS_psubsb, INS_psubusb, INS_psubsw, INS_psubusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE2_SubtractScalar, "SubtractScalar", SSE2, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
@@ -286,7 +286,7 @@ HARDWARE_INTRINSIC(SSE2_X64_ConvertToUInt64, "ConvertToUI
HARDWARE_INTRINSIC(SSE2_X64_ConvertScalarToVector128Double, "ConvertScalarToVector128Double", SSE2_X64, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg)
HARDWARE_INTRINSIC(SSE2_X64_ConvertScalarToVector128Int64, "ConvertScalarToVector128Int64", SSE2_X64, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov_i2xmm, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(SSE2_X64_ConvertScalarToVector128UInt64, "ConvertScalarToVector128UInt64", SSE2_X64, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov_i2xmm, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
-HARDWARE_INTRINSIC(SSE2_X64_StoreNonTemporal, "StoreNonTemporal", SSE2_X64, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti, INS_movnti, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(SSE2_X64_StoreNonTemporal, "StoreNonTemporal", SSE2_X64, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti, INS_movnti, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags
@@ -297,8 +297,8 @@ HARDWARE_INTRINSIC(SSE3_IsSupported, "get_IsSuppo
HARDWARE_INTRINSIC(SSE3_AddSubtract, "AddSubtract", SSE3, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE3_HorizontalAdd, "HorizontalAdd", SSE3, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE3_HorizontalSubtract, "HorizontalSubtract", SSE3, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE3_LoadAndDuplicateToVector128, "LoadAndDuplicateToVector128", SSE3, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE3_LoadDquVector128, "LoadDquVector128", SSE3, -1, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE3_LoadAndDuplicateToVector128, "LoadAndDuplicateToVector128", SSE3, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE3_LoadDquVector128, "LoadDquVector128", SSE3, -1, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE3_MoveAndDuplicate, "MoveAndDuplicate", SSE3, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE3_MoveHighAndDuplicate, "MoveHighAndDuplicate", SSE3, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE3_MoveLowAndDuplicate, "MoveLowAndDuplicate", SSE3, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
@@ -339,7 +339,7 @@ HARDWARE_INTRINSIC(SSE41_Extract, "Extract",
HARDWARE_INTRINSIC(SSE41_Floor, "Floor", SSE41, 9, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE41_FloorScalar, "FloorScalar", SSE41, 9, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE41_Insert, "Insert", SSE41, -1, 16, 3, {INS_pinsrb, INS_pinsrb, INS_invalid, INS_invalid, INS_pinsrd, INS_pinsrd, INS_invalid, INS_invalid, INS_insertps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(SSE41_LoadAlignedVector128NonTemporal, "LoadAlignedVector128NonTemporal", SSE41, -1, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE41_LoadAlignedVector128NonTemporal, "LoadAlignedVector128NonTemporal", SSE41, -1, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE41_Max, "Max", SSE41, -1, 16, 2, {INS_pmaxsb, INS_invalid, INS_invalid, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE41_Min, "Min", SSE41, -1, 16, 2, {INS_pminsb, INS_invalid, INS_invalid, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE41_MinHorizontal, "MinHorizontal", SSE41, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
@@ -403,9 +403,9 @@ HARDWARE_INTRINSIC(AVX_AndNot, "AndNot",
HARDWARE_INTRINSIC(AVX_Blend, "Blend", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX_BlendVariable, "BlendVariable", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_Ceiling, "Ceiling", AVX, 10, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector128, "BroadcastScalarToVector128", AVX, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector256, "BroadcastScalarToVector256", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_MemoryLoad, HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX_BroadcastVector128ToVector256, "BroadcastVector128ToVector256", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector128, "BroadcastScalarToVector128", AVX, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector256, "BroadcastScalarToVector256", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_BroadcastVector128ToVector256, "BroadcastVector128ToVector256", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_Compare, "Compare", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_CompareScalar, "CompareScalar", AVX, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(AVX_ConvertToVector128Int32, "ConvertToVector128Int32", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
@@ -424,12 +424,12 @@ HARDWARE_INTRINSIC(AVX_Floor, "Floor",
HARDWARE_INTRINSIC(AVX_HorizontalAdd, "HorizontalAdd", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_HorizontalSubtract, "HorizontalSubtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_InsertVector128, "InsertVector128", AVX, -1, 32, 3, {INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX_LoadAlignedVector256, "LoadAlignedVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_LoadDquVector256, "LoadDquVector256", AVX, -1, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_LoadVector256, "LoadVector256", AVX, -1, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_LoadAlignedVector256, "LoadAlignedVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_LoadDquVector256, "LoadDquVector256", AVX, -1, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_LoadVector256, "LoadVector256", AVX, -1, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_Max, "Max", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX_Min, "Min", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX_MaskLoad, "MaskLoad", AVX, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(AVX_MaskLoad, "MaskLoad", AVX, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_UnfixedSIMDSize)
HARDWARE_INTRINSIC(AVX_MaskStore, "MaskStore", AVX, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize|HW_Flag_BaseTypeFromSecondArg)
HARDWARE_INTRINSIC(AVX_MoveMask, "MoveMask", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AVX_Multiply, "Multiply", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
@@ -446,9 +446,9 @@ HARDWARE_INTRINSIC(AVX_RoundToPositiveInfinity, "RoundToPosi
HARDWARE_INTRINSIC(AVX_RoundToZero, "RoundToZero", AVX, 11, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_Shuffle, "Shuffle", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_NoRMWSemantics|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX_Sqrt, "Sqrt", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_Store, "Store", AVX, -1, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_StoreAligned, "StoreAligned", AVX, -1, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", AVX, -1, 32, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_Store, "Store", AVX, -1, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_StoreAligned, "StoreAligned", AVX, -1, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", AVX, -1, 32, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_Subtract, "Subtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_TestC, "TestC", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AVX_TestNotZAndNotC, "TestNotZAndNotC", AVX, -1, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
@@ -472,9 +472,9 @@ HARDWARE_INTRINSIC(AVX2_AndNot, "AndNot",
HARDWARE_INTRINSIC(AVX2_Average, "Average", AVX2, -1, 32, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX2_Blend, "Blend", AVX2, -1, 0, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_UnfixedSIMDSize|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX2_BlendVariable, "BlendVariable", AVX2, -1, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector128, "BroadcastScalarToVector128", AVX2, -1, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector256, "BroadcastScalarToVector256", AVX2, -1, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2_BroadcastVector128ToVector256, "BroadcastVector128ToVector256", AVX2, -1, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector128, "BroadcastScalarToVector128", AVX2, -1, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector256, "BroadcastScalarToVector256", AVX2, -1, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX2_BroadcastVector128ToVector256, "BroadcastVector128ToVector256", AVX2, -1, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_CompareEqual, "CompareEqual", AVX2, -1, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX2_CompareGreaterThan, "CompareGreaterThan", AVX2, -1, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_ExtractVector128, "ExtractVector128", AVX2, -1, 32, 2, {INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
@@ -495,9 +495,9 @@ HARDWARE_INTRINSIC(AVX2_HorizontalAddSaturate, "HorizontalA
HARDWARE_INTRINSIC(AVX2_HorizontalSubtract, "HorizontalSubtract", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_HorizontalSubtractSaturate, "HorizontalSubtractSaturate", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_InsertVector128, "InsertVector128", AVX2, -1, 32, 3, {INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal, "LoadAlignedVector256NonTemporal", AVX2, -1, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2_MaskLoad, "MaskLoad", AVX2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize)
-HARDWARE_INTRINSIC(AVX2_MaskStore, "MaskStore", AVX2, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize|HW_Flag_BaseTypeFromSecondArg)
+HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal, "LoadAlignedVector256NonTemporal", AVX2, -1, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2_MaskLoad, "MaskLoad", AVX2, -1, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(AVX2_MaskStore, "MaskStore", AVX2, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_UnfixedSIMDSize|HW_Flag_BaseTypeFromSecondArg)
HARDWARE_INTRINSIC(AVX2_Max, "Max", AVX2, -1, 32, 2, {INS_pmaxsb, INS_pmaxub, INS_pmaxsw, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX2_Min, "Min", AVX2, -1, 32, 2, {INS_pminsb, INS_pminub, INS_pminsw, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX2_MoveMask, "MoveMask", AVX2, -1, 32, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp
index fcc20e04c6..dc298ed230 100644
--- a/src/jit/hwintrinsicxarch.cpp
+++ b/src/jit/hwintrinsicxarch.cpp
@@ -847,6 +847,15 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
case 1:
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
op1 = getArgForHWIntrinsic(argType, argClass);
+ if ((category == HW_Category_MemoryLoad) && op1->OperIs(GT_CAST))
+ {
+ // Although the API specifies a pointer, if what we have is a BYREF, that's what
+ // we really want, so throw away the cast.
+ if (op1->gtGetOp1()->TypeGet() == TYP_BYREF)
+ {
+ op1 = op1->gtGetOp1();
+ }
+ }
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
break;
case 2:
diff --git a/src/jit/instr.cpp b/src/jit/instr.cpp
index cbab22be76..72a3fb93bb 100644
--- a/src/jit/instr.cpp
+++ b/src/jit/instr.cpp
@@ -1039,7 +1039,126 @@ void CodeGen::inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regN
getEmitter()->emitIns_R_R_I(ins, size, reg1, reg2, ival);
}
-#endif
+
+#ifdef FEATURE_HW_INTRINSICS
+//------------------------------------------------------------------------
+// inst_RV_TT_IV: Generates an instruction that takes 3 operands:
+// a register operand, an operand that may be memory or register, and an immediate,
+// and that returns a value in a register
+//
+// Arguments:
+// ins -- The instruction being emitted
+// attr -- The emit attribute
+// reg1 -- The first operand, a register
+// rmOp -- The second operand, which may be a memory node or a node producing a register
+// ival -- The immediate operand
+//
+// Notes:
+// This isn't really specific to HW intrinsics, but depends on other methods that are
+// only defined for FEATURE_HW_INTRINSICS, and is currently only used in that context.
+//
+void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival)
+{
+ noway_assert(getEmitter()->emitVerifyEncodable(ins, EA_SIZE(attr), reg1));
+
+ if (rmOp->isContained() || rmOp->isUsedFromSpillTemp())
+ {
+ TempDsc* tmpDsc = nullptr;
+ unsigned varNum = BAD_VAR_NUM;
+ unsigned offset = (unsigned)-1;
+
+ if (rmOp->isUsedFromSpillTemp())
+ {
+ assert(rmOp->IsRegOptional());
+
+ tmpDsc = getSpillTempDsc(rmOp);
+ varNum = tmpDsc->tdTempNum();
+ offset = 0;
+
+ regSet.tmpRlsTemp(tmpDsc);
+ }
+ else if (rmOp->OperIsHWIntrinsic())
+ {
+ getEmitter()->emitIns_R_AR_I(ins, attr, reg1, rmOp->gtGetOp1()->gtRegNum, 0, ival);
+ return;
+ }
+ else if (rmOp->isIndir())
+ {
+ GenTreeIndir* memIndir = rmOp->AsIndir();
+ GenTree* memBase = memIndir->gtOp1;
+
+ switch (memBase->OperGet())
+ {
+ case GT_LCL_VAR_ADDR:
+ {
+ varNum = memBase->AsLclVarCommon()->GetLclNum();
+ offset = 0;
+
+ // Ensure that all the GenTreeIndir values are set to their defaults.
+ assert(!memIndir->HasIndex());
+ assert(memIndir->Scale() == 1);
+ assert(memIndir->Offset() == 0);
+
+ break;
+ }
+
+ case GT_CLS_VAR_ADDR:
+ {
+ getEmitter()->emitIns_R_C_I(ins, attr, reg1, memBase->gtClsVar.gtClsVarHnd, 0, ival);
+ return;
+ }
+
+ default:
+ {
+ getEmitter()->emitIns_R_A_I(ins, attr, reg1, memIndir, ival);
+ return;
+ }
+ }
+ }
+ else
+ {
+ switch (rmOp->OperGet())
+ {
+ case GT_LCL_FLD:
+ {
+ GenTreeLclFld* lclField = rmOp->AsLclFld();
+
+ varNum = lclField->GetLclNum();
+ offset = lclField->gtLclFld.gtLclOffs;
+ break;
+ }
+
+ case GT_LCL_VAR:
+ {
+ assert(rmOp->IsRegOptional() || !compiler->lvaGetDesc(rmOp->gtLclVar.gtLclNum)->lvIsRegCandidate());
+ varNum = rmOp->AsLclVar()->GetLclNum();
+ offset = 0;
+ break;
+ }
+
+ default:
+ unreached();
+ break;
+ }
+ }
+
+ // Ensure we got a good varNum and offset.
+ // We also need to check for `tmpDsc != nullptr` since spill temp numbers
+ // are negative and start with -1, which also happens to be BAD_VAR_NUM.
+ assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
+ assert(offset != (unsigned)-1);
+
+ getEmitter()->emitIns_R_S_I(ins, attr, reg1, varNum, offset, ival);
+ }
+ else
+ {
+ regNumber rmOpReg = rmOp->gtRegNum;
+ getEmitter()->emitIns_SIMD_R_R_I(ins, attr, reg1, rmOpReg, ival);
+ }
+}
+#endif // FEATURE_HW_INTRINSICS
+
+#endif // _TARGET_XARCH_
/*****************************************************************************
*
diff --git a/src/jit/lower.h b/src/jit/lower.h
index e29bb9c4d6..e0a7c64d2f 100644
--- a/src/jit/lower.h
+++ b/src/jit/lower.h
@@ -104,6 +104,7 @@ private:
void ContainCheckSIMD(GenTreeSIMD* simdNode);
#endif // FEATURE_SIMD
#ifdef FEATURE_HW_INTRINSICS
+ void ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree** pAddr);
void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node);
#endif // FEATURE_HW_INTRINSICS
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 292fb93c53..373f881a8a 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -2488,6 +2488,10 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge
switch (category)
{
+ case HW_Category_MemoryLoad:
+ supportsGeneralLoads = (!node->OperIsHWIntrinsic());
+ break;
+
case HW_Category_SimpleSIMD:
{
// These intrinsics only expect 16 or 32-byte nodes for containment
@@ -2664,6 +2668,15 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge
break;
}
+ case NI_AVX2_BroadcastScalarToVector128:
+ case NI_AVX2_BroadcastScalarToVector256:
+ {
+ // The memory form of this already takes a pointer, and cannot be further contained.
+ // The containable form is the one that takes a SIMD value, that may be in memory.
+ supportsGeneralLoads = (node->TypeGet() == TYP_SIMD16);
+ break;
+ }
+
case NI_SSE_ConvertScalarToVector128Single:
case NI_SSE2_ConvertScalarToVector128Double:
case NI_SSE2_ConvertScalarToVector128Int32:
@@ -2782,6 +2795,28 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge
}
//----------------------------------------------------------------------------------------------
+// ContainCheckHWIntrinsicAddr: Perform containment analysis for an address operand of a hardware
+// intrinsic node.
+//
+// Arguments:
+// node - The hardware intrinsic node
+// pAddr - The "parent" pointer to the address operand, so that we can update the operand
+// of the parent as needed.
+//
+void Lowering::ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree** pAddr)
+{
+ assert(((*pAddr)->TypeGet() == TYP_I_IMPL) || ((*pAddr)->TypeGet() == TYP_BYREF));
+ TryCreateAddrMode(LIR::Use(BlockRange(), pAddr, node), true);
+ GenTree* addr = *pAddr;
+ if ((addr->OperIs(GT_CLS_VAR_ADDR, GT_LCL_VAR_ADDR) ||
+ (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp)) || (addr->OperGet() == GT_LEA)) &&
+ IsSafeToContainMem(node, addr))
+ {
+ MakeSrcContained(node, addr);
+ }
+}
+
+//----------------------------------------------------------------------------------------------
// ContainCheckHWIntrinsic: Perform containment analysis for a hardware intrinsic node.
//
// Arguments:
@@ -2800,7 +2835,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
if (!HWIntrinsicInfo::SupportsContainment(intrinsicId))
{
- // AVX2 gather are not contaibable and always have constant IMM argument
+ // AVX2 gather are not containable and always have constant IMM argument
if (HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsicId))
{
GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node);
@@ -2825,6 +2860,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
switch (category)
{
+ case HW_Category_MemoryLoad:
+ {
+ GenTree** pAddr = &node->gtOp1;
+ ContainCheckHWIntrinsicAddr(node, pAddr);
+ break;
+ }
case HW_Category_SimpleSIMD:
case HW_Category_SIMDScalar:
case HW_Category_Scalar:
@@ -2905,6 +2946,26 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
switch (category)
{
+ case HW_Category_MemoryLoad:
+ {
+ GenTree** pAddr = nullptr;
+ if ((intrinsicId == NI_AVX_MaskLoad) || (intrinsicId == NI_AVX2_MaskLoad))
+ {
+ pAddr = &node->gtOp.gtOp1;
+ }
+ else
+ {
+ pAddr = &node->gtOp.gtOp2;
+ }
+ ContainCheckHWIntrinsicAddr(node, pAddr);
+ break;
+ }
+ case HW_Category_MemoryStore:
+ {
+ GenTree** pAddr = &node->gtOp1;
+ ContainCheckHWIntrinsicAddr(node, pAddr);
+ break;
+ }
case HW_Category_SimpleSIMD:
case HW_Category_SIMDScalar:
case HW_Category_Scalar:
@@ -3113,6 +3174,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
switch (category)
{
+ case HW_Category_MemoryStore:
+ {
+ GenTree** pAddr = &node->gtOp.gtOp1->gtOp.gtOp1;
+ ContainCheckHWIntrinsicAddr(node, pAddr);
+ break;
+ }
case HW_Category_SimpleSIMD:
case HW_Category_SIMDScalar:
case HW_Category_Scalar:
diff --git a/src/jit/lsra.h b/src/jit/lsra.h
index c0ce4fbdc3..8494699a0d 100644
--- a/src/jit/lsra.h
+++ b/src/jit/lsra.h
@@ -1535,6 +1535,7 @@ private:
int BuildOperandUses(GenTree* node, regMaskTP candidates = RBM_NONE);
int BuildDelayFreeUses(GenTree* node, regMaskTP candidates = RBM_NONE);
int BuildIndirUses(GenTreeIndir* indirTree, regMaskTP candidates = RBM_NONE);
+ int BuildAddrUses(GenTree* addr, regMaskTP candidates = RBM_NONE);
void HandleFloatVarArgs(GenTreeCall* call, GenTree* argNode, bool* callHasFloatRegArgs);
RefPosition* BuildDef(GenTree* tree, regMaskTP dstCandidates = RBM_NONE, int multiRegIdx = 0);
void BuildDefs(GenTree* tree, int dstCount, regMaskTP dstCandidates = RBM_NONE);
diff --git a/src/jit/lsrabuild.cpp b/src/jit/lsrabuild.cpp
index 38dfca6985..aada9b2f75 100644
--- a/src/jit/lsrabuild.cpp
+++ b/src/jit/lsrabuild.cpp
@@ -2672,6 +2672,11 @@ RefPosition* LinearScan::BuildUse(GenTree* operand, regMaskTP candidates, int mu
int LinearScan::BuildIndirUses(GenTreeIndir* indirTree, regMaskTP candidates)
{
GenTree* const addr = indirTree->gtOp1;
+ return BuildAddrUses(addr, candidates);
+}
+
+int LinearScan::BuildAddrUses(GenTree* addr, regMaskTP candidates)
+{
if (!addr->isContained())
{
BuildUse(addr, candidates);
@@ -2725,11 +2730,17 @@ int LinearScan::BuildOperandUses(GenTree* node, regMaskTP candidates)
{
return BuildIndirUses(node->AsIndir(), candidates);
}
+#ifdef FEATURE_HW_INTRINSICS
if (node->OperIsHWIntrinsic())
{
+ if (node->AsHWIntrinsic()->OperIsMemoryLoad())
+ {
+ return BuildAddrUses(node->gtGetOp1());
+ }
BuildUse(node->gtGetOp1(), candidates);
return 1;
}
+#endif // FEATURE_HW_INTRINSICS
return 0;
}
diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp
index 364a4b7103..c430db049d 100644
--- a/src/jit/lsraxarch.cpp
+++ b/src/jit/lsraxarch.cpp
@@ -2699,11 +2699,29 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
{
assert((numArgs > 0) && (numArgs < 4));
- srcCount += BuildOperandUses(op1);
+ if (intrinsicTree->OperIsMemoryLoadOrStore())
+ {
+ srcCount += BuildAddrUses(op1);
+ }
+ else
+ {
+ srcCount += BuildOperandUses(op1);
+ }
if (op2 != nullptr)
{
- srcCount += (isRMW) ? BuildDelayFreeUses(op2) : BuildOperandUses(op2);
+ if (op2->OperIs(GT_HWIntrinsic) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained())
+ {
+ srcCount += BuildAddrUses(op2->gtGetOp1());
+ }
+ else if (isRMW)
+ {
+ srcCount += BuildDelayFreeUses(op2);
+ }
+ else
+ {
+ srcCount += BuildOperandUses(op2);
+ }
if (op3 != nullptr)
{
diff --git a/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.cs b/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.cs
new file mode 100644
index 0000000000..0e8f03c977
--- /dev/null
+++ b/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.cs
@@ -0,0 +1,143 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using System.Threading;
+
+// Test folding of addressing expressions
+
+public class Program
+{
+ struct S
+ {
+ public float f0;
+ public float f1;
+ public float f2;
+ public float f3;
+ public float f4;
+ public float f5;
+ public float f6;
+ public float f7;
+ public float f8;
+ public float f9;
+ public float f10;
+ public float f11;
+ public float f12;
+ public float f13;
+ public float f14;
+ public float f15;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static unsafe int Test(ref S s, Vector128<float> v, int offset)
+ {
+ int returnVal = 100;
+
+ if (Sse2.IsSupported)
+ {
+ fixed (float* p = &s.f0)
+ {
+ // We need an address aligned on 16 bytes, so we need to add a *float* offset to get there.
+ int alignmentOffset = (0x10 - ((int)p & 0xc)) >> 2;
+ try
+ {
+ // This is the aligned case.
+ // We're going to store a scalar at an offset of 2 from the aligned location.
+ // As it happens, we know that the struct has been initialized to all zeros,
+ // and the vector passed in was all ones, so now we have a one at offset 2.
+ Sse2.StoreScalar(p + alignmentOffset + 2, Sse2.Subtract(v, Sse2.LoadAlignedVector128(p + offset + alignmentOffset + 4)));
+
+ // Now do a load from the aligned location.
+ // That should give us {0, 0, 1, 0}.
+ Vector128<float> v2;
+ if (Sse41.IsSupported)
+ {
+ v2 = Sse41.LoadAlignedVector128NonTemporal((byte*)(p + alignmentOffset)).AsSingle();
+ }
+ else
+ {
+ v2 = Sse2.LoadVector128((byte*)(p + alignmentOffset)).AsSingle();
+ }
+ if (!v2.Equals(Vector128.Create(0.0F, 0.0F, 1.0F, 0.0F)))
+ {
+ Console.WriteLine("Aligned case FAILED: v2 = " + v2);
+ returnVal = -1;
+ }
+
+ // This is the unaligned case. The value we're loading to subtract is one element earlier than what we just stored.
+ // So we're doing { 1, 1, 1, 1 } - { 0, 1, 0, 0 } = { 1, 0, 1, 1 }
+ Sse2.Store(p + alignmentOffset + 1, Sse2.Subtract(v, Sse2.LoadVector128(p + offset + alignmentOffset + 1)));
+ // Now do an unaligned load from that location.
+ v2 = Sse2.LoadVector128(p + alignmentOffset + 1);
+ if (!v2.Equals(Vector128.Create(1.0F, 0.0F, 1.0F, 1.0F)))
+ {
+ Console.WriteLine("Unaligned case FAILED: v2 = " + v2);
+ returnVal = -1;
+ }
+
+ }
+ catch (Exception e)
+ {
+ Console.WriteLine("Unexpected exception: " + e.Message);
+ returnVal = -1;
+ }
+ }
+ }
+ return returnVal;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static unsafe int Test256(ref S s, Vector256<float> v, int offset)
+ {
+ int returnVal = 100;
+ if (Avx.IsSupported)
+ {
+ // offset must be a multiple of the vector size in floats.
+ offset &= ~3;
+ fixed (float* p = &s.f0)
+ {
+ try
+ {
+ Avx.Store(p + 1, Avx.Subtract(v, Avx.LoadVector256(p + offset + 1)));
+ Vector256<float> v2 = Avx.LoadVector256(p + 1);
+ if (!v2.Equals(v))
+ {
+ Console.WriteLine("Vector256 case FAILED: v = " + v + ", v2 = " + v2);
+ returnVal = -1;
+ }
+ }
+ catch (Exception e)
+ {
+ Console.WriteLine("Unexpected exception: " + e.Message);
+ returnVal = -1;
+ }
+ }
+ }
+ return returnVal;
+ }
+
+ static int Main()
+ {
+ S s = new S();
+ Vector128<float> v = Vector128.Create(1.0F);
+ int returnVal = Test(ref s, v, 0);
+ if (returnVal != 100)
+ {
+ Console.WriteLine("Vector128 test failed.");
+ }
+
+ // Get a new vector initialized to zeros.
+ S s2 = new S();
+ Vector256<float> v2 = Vector256.Create(1.0F);
+ if (Test256(ref s2, v2, 4) != 100)
+ {
+ Console.WriteLine("Vector256 test failed.");
+ returnVal = -1;
+ }
+ return returnVal;
+ }
+}
diff --git a/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.csproj b/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.csproj
new file mode 100644
index 0000000000..1ef998940b
--- /dev/null
+++ b/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.csproj
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+ <PropertyGroup>
+ <AssemblyName>$(MSBuildProjectName)</AssemblyName>
+ <OutputType>Exe</OutputType>
+ <DebugType>None</DebugType>
+ <Optimize>True</Optimize>
+ <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
+ </PropertyGroup>
+ <ItemGroup>
+ <Compile Include="$(MSBuildProjectName).cs" />
+ </ItemGroup>
+ <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+ <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>