author    Tanner Gooding <tagoo@outlook.com>   2018-05-23 11:20:29 -0700
committer Tanner Gooding <tagoo@outlook.com>   2018-05-25 16:08:06 -0700
commit    5fc7dd5917d9e0d2345023a87f73a32c2ebfdbe8 (patch)
tree      7eeacc428a501e73d38610e443bfcf0992f0853e
parent    6977efd6b2a7b2d79479ac461da33b6512a72f90 (diff)
Updating the JIT to handle the FMA hardware intrinsics.
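
For context: the 132/213/231 suffixes used throughout this change name the three hardware operand orderings. The digits give, in order, which instruction operand is the multiplicand, the multiplier, and the addend; the first operand is also the destination, and the last one may come from memory. A minimal plain-C++ sketch of those semantics (variable names are illustrative, not from the change):

// Illustrative only: what the 132/213/231 operand orderings compute.
// The first operand doubles as the destination (read-modify-write),
// and the last operand is the one that may come from memory.
#include <cstdio>

int main()
{
    double src1 = 2.0, src2 = 3.0, src3 = 4.0;

    double f132 = src1 * src3 + src2; // vfmadd132: src1 = (src1 * src3) + src2
    double f213 = src2 * src1 + src3; // vfmadd213: src1 = (src2 * src1) + src3
    double f231 = src2 * src3 + src1; // vfmadd231: src1 = (src2 * src3) + src1

    printf("132: %g  213: %g  231: %g\n", f132, f213, f231);
    return 0;
}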
-rw-r--r--  src/jit/codegenlinear.h                 2
-rw-r--r--  src/jit/emitxarch.cpp                 189
-rw-r--r--  src/jit/emitxarch.h                    13
-rw-r--r--  src/jit/gentree.cpp                    12
-rw-r--r--  src/jit/hwintrinsiccodegenxarch.cpp   208
-rw-r--r--  src/jit/hwintrinsiclistxarch.h         10
-rw-r--r--  src/jit/hwintrinsicxarch.cpp            2
-rw-r--r--  src/jit/instrsxarch.h                  65
-rw-r--r--  src/jit/lowerxarch.cpp                 45
-rw-r--r--  src/jit/lsraxarch.cpp                  88
10 files changed, 630 insertions(+), 4 deletions(-)
diff --git a/src/jit/codegenlinear.h b/src/jit/codegenlinear.h
index 5fa5b4aef0..0579fa5a00 100644
--- a/src/jit/codegenlinear.h
+++ b/src/jit/codegenlinear.h
@@ -117,6 +117,8 @@ void genHWIntrinsic(GenTreeHWIntrinsic* node);
#if defined(_TARGET_XARCH_)
void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins);
void genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins);
+void genHWIntrinsic_R_R_R_RM(
+ instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3);
void genSSEIntrinsic(GenTreeHWIntrinsic* node);
void genSSE2Intrinsic(GenTreeHWIntrinsic* node);
void genSSE41Intrinsic(GenTreeHWIntrinsic* node);
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index 4fdc5c8cb2..706ce5203d 100644
--- a/src/jit/emitxarch.cpp
+++ b/src/jit/emitxarch.cpp
@@ -45,6 +45,11 @@ bool IsAVXOnlyInstruction(instruction ins)
return (ins >= INS_FIRST_AVX_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION);
}
+bool IsFMAInstruction(instruction ins)
+{
+ return (ins >= INS_FIRST_FMA_INSTRUCTION) && (ins <= INS_LAST_FMA_INSTRUCTION);
+}
+
bool emitter::IsAVXInstruction(instruction ins)
{
return UseVEXEncoding() && IsSSEOrAVXInstruction(ins);
@@ -206,6 +211,66 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
case INS_unpcklps:
case INS_unpckhpd:
case INS_unpcklpd:
+ case INS_vfmadd132pd:
+ case INS_vfmadd213pd:
+ case INS_vfmadd231pd:
+ case INS_vfmadd132ps:
+ case INS_vfmadd213ps:
+ case INS_vfmadd231ps:
+ case INS_vfmadd132sd:
+ case INS_vfmadd213sd:
+ case INS_vfmadd231sd:
+ case INS_vfmadd132ss:
+ case INS_vfmadd213ss:
+ case INS_vfmadd231ss:
+ case INS_vfmaddsub132pd:
+ case INS_vfmaddsub213pd:
+ case INS_vfmaddsub231pd:
+ case INS_vfmaddsub132ps:
+ case INS_vfmaddsub213ps:
+ case INS_vfmaddsub231ps:
+ case INS_vfmsubadd132pd:
+ case INS_vfmsubadd213pd:
+ case INS_vfmsubadd231pd:
+ case INS_vfmsubadd132ps:
+ case INS_vfmsubadd213ps:
+ case INS_vfmsubadd231ps:
+ case INS_vfmsub132pd:
+ case INS_vfmsub213pd:
+ case INS_vfmsub231pd:
+ case INS_vfmsub132ps:
+ case INS_vfmsub213ps:
+ case INS_vfmsub231ps:
+ case INS_vfmsub132sd:
+ case INS_vfmsub213sd:
+ case INS_vfmsub231sd:
+ case INS_vfmsub132ss:
+ case INS_vfmsub213ss:
+ case INS_vfmsub231ss:
+ case INS_vfnmadd132pd:
+ case INS_vfnmadd213pd:
+ case INS_vfnmadd231pd:
+ case INS_vfnmadd132ps:
+ case INS_vfnmadd213ps:
+ case INS_vfnmadd231ps:
+ case INS_vfnmadd132sd:
+ case INS_vfnmadd213sd:
+ case INS_vfnmadd231sd:
+ case INS_vfnmadd132ss:
+ case INS_vfnmadd213ss:
+ case INS_vfnmadd231ss:
+ case INS_vfnmsub132pd:
+ case INS_vfnmsub213pd:
+ case INS_vfnmsub231pd:
+ case INS_vfnmsub132ps:
+ case INS_vfnmsub213ps:
+ case INS_vfnmsub231ps:
+ case INS_vfnmsub132sd:
+ case INS_vfnmsub213sd:
+ case INS_vfnmsub231sd:
+ case INS_vfnmsub132ss:
+ case INS_vfnmsub213ss:
+ case INS_vfnmsub231ss:
case INS_vinsertf128:
case INS_vinserti128:
case INS_vmaskmovps:
@@ -368,6 +433,36 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr)
case INS_vpsllvq:
case INS_pinsrq:
case INS_pextrq:
+ case INS_vfmadd132pd:
+ case INS_vfmadd213pd:
+ case INS_vfmadd231pd:
+ case INS_vfmadd132sd:
+ case INS_vfmadd213sd:
+ case INS_vfmadd231sd:
+ case INS_vfmaddsub132pd:
+ case INS_vfmaddsub213pd:
+ case INS_vfmaddsub231pd:
+ case INS_vfmsubadd132pd:
+ case INS_vfmsubadd213pd:
+ case INS_vfmsubadd231pd:
+ case INS_vfmsub132pd:
+ case INS_vfmsub213pd:
+ case INS_vfmsub231pd:
+ case INS_vfmsub132sd:
+ case INS_vfmsub213sd:
+ case INS_vfmsub231sd:
+ case INS_vfnmadd132pd:
+ case INS_vfnmadd213pd:
+ case INS_vfnmadd231pd:
+ case INS_vfnmadd132sd:
+ case INS_vfnmadd213sd:
+ case INS_vfnmadd231sd:
+ case INS_vfnmsub132pd:
+ case INS_vfnmsub213pd:
+ case INS_vfnmsub231pd:
+ case INS_vfnmsub132sd:
+ case INS_vfnmsub213sd:
+ case INS_vfnmsub231sd:
return true;
default:
break;
@@ -5360,12 +5455,85 @@ void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber reg,
}
}
+void emitter::emitIns_SIMD_R_R_R_A(
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, GenTreeIndir* indir)
+{
+ assert(IsFMAInstruction(ins));
+ assert(UseVEXEncoding());
+
+ if (reg != reg1)
+ {
+ // Ensure we aren't overwriting op2
+ assert(reg != reg2);
+
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
+ }
+
+ emitIns_R_R_A(ins, attr, reg, reg2, indir, IF_RWR_RRD_ARD);
+}
+
+void emitter::emitIns_SIMD_R_R_R_AR(
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber base)
+{
+ assert(IsFMAInstruction(ins));
+ assert(UseVEXEncoding());
+
+ if (reg != reg1)
+ {
+ // Ensure we aren't overwriting op2
+ assert(reg != reg2);
+
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
+ }
+
+ emitIns_R_R_AR(ins, attr, reg, reg2, base, 0);
+}
+
+void emitter::emitIns_SIMD_R_R_R_C(instruction ins,
+ emitAttr attr,
+ regNumber reg,
+ regNumber reg1,
+ regNumber reg2,
+ CORINFO_FIELD_HANDLE fldHnd,
+ int offs)
+{
+ assert(IsFMAInstruction(ins));
+ assert(UseVEXEncoding());
+
+ if (reg != reg1)
+ {
+ // Ensure we aren't overwriting op2
+ assert(reg != reg2);
+
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
+ }
+
+ emitIns_R_R_C(ins, attr, reg, reg2, fldHnd, offs);
+}
+
void emitter::emitIns_SIMD_R_R_R_R(
instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3)
{
- assert(isAvxBlendv(ins) || isSse41Blendv(ins));
- if (UseVEXEncoding())
+ if (IsFMAInstruction(ins))
{
+ assert(UseVEXEncoding());
+
+ if (reg != reg1)
+ {
+ // Ensure we aren't overwriting op2 or op3
+
+ assert(reg != reg2);
+ assert(reg != reg3);
+
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
+ }
+
+ emitIns_R_R_R(ins, attr, reg, reg2, reg3);
+ }
+ else if (UseVEXEncoding())
+ {
+ assert(isAvxBlendv(ins) || isSse41Blendv(ins));
+
// convert SSE encoding of SSE4.1 instructions to VEX encoding
switch (ins)
{
@@ -5407,6 +5575,23 @@ void emitter::emitIns_SIMD_R_R_R_R(
}
}
+void emitter::emitIns_SIMD_R_R_R_S(
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int varx, int offs)
+{
+ assert(IsFMAInstruction(ins));
+ assert(UseVEXEncoding());
+
+ if (reg != reg1)
+ {
+ // Ensure we aren't overwriting op2
+ assert(reg != reg2);
+
+ emitIns_R_R(INS_movaps, attr, reg, reg1);
+ }
+
+ emitIns_R_R_S(ins, attr, reg, reg2, varx, offs);
+}
+
void emitter::emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs)
{
if (UseVEXEncoding())
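
Note on the emitter changes above: the five new emitIns_SIMD_R_R_R_* overloads share the same preamble, because the FMA instructions are read-modify-write. When the allocated target register differs from op1, op1 is first copied into the target with movaps, and that copy must not clobber any remaining source register. A small self-contained model of that preamble and hazard, using an array as a stand-in for the register file (names below are illustrative, not emitter APIs):

// Model of the RMW preamble in the emitIns_SIMD_R_R_R_* overloads: when the
// target register differs from op1, op1 is copied in first ("movaps target, op1"),
// which is only safe if the target does not alias another source.
#include <cassert>

struct RegFile { double r[8]; };

// target/op1/op2 are register indices; mem stands for the contained third operand.
void emitFma213(RegFile& regs, int target, int op1, int op2, double mem)
{
    if (target != op1)
    {
        assert(target != op2);        // the copy below must not overwrite op2
        regs.r[target] = regs.r[op1]; // movaps target, op1
    }
    // vfmadd213 target, op2, [mem]: target = op2 * target + mem
    regs.r[target] = regs.r[op2] * regs.r[target] + mem;
}

int main()
{
    RegFile regs = {{2.0, 3.0, 0.0}};
    emitFma213(regs, /*target*/ 2, /*op1*/ 0, /*op2*/ 1, /*mem*/ 4.0);
    assert(regs.r[2] == 2.0 * 3.0 + 4.0); // op2 * op1 + op3 == 10
    return 0;
}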
diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h
index f85157e1eb..31cafa1cf2 100644
--- a/src/jit/emitxarch.h
+++ b/src/jit/emitxarch.h
@@ -422,8 +422,21 @@ void emitIns_SIMD_R_R_C(
void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs);
void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2);
void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int ival);
+void emitIns_SIMD_R_R_R_A(
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, GenTreeIndir* indir);
+void emitIns_SIMD_R_R_R_AR(
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber base);
+void emitIns_SIMD_R_R_R_C(instruction ins,
+ emitAttr attr,
+ regNumber reg,
+ regNumber reg1,
+ regNumber reg2,
+ CORINFO_FIELD_HANDLE fldHnd,
+ int offs);
void emitIns_SIMD_R_R_R_R(
instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3);
+void emitIns_SIMD_R_R_R_S(
+ instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int varx, int offs);
#endif // FEATURE_HW_INTRINSICS
enum EmitCallType
diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp
index 965538dcaf..d0230b8e29 100644
--- a/src/jit/gentree.cpp
+++ b/src/jit/gentree.cpp
@@ -17420,7 +17420,19 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp)
switch (AsHWIntrinsic()->gtHWIntrinsicId)
{
+ // TODO-XArch-Cleanup: Move this switch block to be table driven.
+
case NI_SSE42_Crc32:
+ case NI_FMA_MultiplyAdd:
+ case NI_FMA_MultiplyAddNegated:
+ case NI_FMA_MultiplyAddNegatedScalar:
+ case NI_FMA_MultiplyAddScalar:
+ case NI_FMA_MultiplyAddSubtract:
+ case NI_FMA_MultiplySubtract:
+ case NI_FMA_MultiplySubtractAdd:
+ case NI_FMA_MultiplySubtractNegated:
+ case NI_FMA_MultiplySubtractNegatedScalar:
+ case NI_FMA_MultiplySubtractScalar:
return true;
default:
diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp
index d8cac11cb0..42f0c7112c 100644
--- a/src/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/jit/hwintrinsiccodegenxarch.cpp
@@ -569,6 +569,124 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
}
}
+//------------------------------------------------------------------------
+// genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
+// a register/memory operand, and that returns a value in register
+//
+// Arguments:
+// ins - The instruction being generated
+// attr - The emit attribute
+// targetReg - The target register
+// op1Reg - The register of the first operand
+// op2Reg - The register of the second operand
+// op3 - The third operand
+//
+void CodeGen::genHWIntrinsic_R_R_R_RM(
+ instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
+{
+ assert(targetReg != REG_NA);
+ assert(op1Reg != REG_NA);
+ assert(op2Reg != REG_NA);
+
+ emitter* emit = getEmitter();
+
+ if (op3->isContained() || op3->isUsedFromSpillTemp())
+ {
+ TempDsc* tmpDsc = nullptr;
+ unsigned varNum = BAD_VAR_NUM;
+ unsigned offset = (unsigned)-1;
+
+ if (op3->isUsedFromSpillTemp())
+ {
+ assert(op3->IsRegOptional());
+
+ // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
+ // pattern. It could probably be extracted to its own method.
+ tmpDsc = getSpillTempDsc(op3);
+ varNum = tmpDsc->tdTempNum();
+ offset = 0;
+
+ compiler->tmpRlsTemp(tmpDsc);
+ }
+ else if (op3->OperIsHWIntrinsic())
+ {
+ emit->emitIns_SIMD_R_R_R_AR(ins, attr, targetReg, op1Reg, op2Reg, op3->gtGetOp1()->gtRegNum);
+ return;
+ }
+ else if (op3->isIndir())
+ {
+ GenTreeIndir* memIndir = op3->AsIndir();
+ GenTree* memBase = memIndir->gtOp1;
+
+ switch (memBase->OperGet())
+ {
+ case GT_LCL_VAR_ADDR:
+ {
+ varNum = memBase->AsLclVarCommon()->GetLclNum();
+ offset = 0;
+
+ // Ensure that all the GenTreeIndir values are set to their defaults.
+ assert(!memIndir->HasIndex());
+ assert(memIndir->Scale() == 1);
+ assert(memIndir->Offset() == 0);
+
+ break;
+ }
+
+ case GT_CLS_VAR_ADDR:
+ {
+ emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, memBase->gtClsVar.gtClsVarHnd, 0);
+ return;
+ }
+
+ default:
+ {
+ emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
+ return;
+ }
+ }
+ }
+ else
+ {
+ switch (op3->OperGet())
+ {
+ case GT_LCL_FLD:
+ {
+ GenTreeLclFld* lclField = op3->AsLclFld();
+
+ varNum = lclField->GetLclNum();
+ offset = lclField->gtLclFld.gtLclOffs;
+ break;
+ }
+
+ case GT_LCL_VAR:
+ {
+ assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
+ varNum = op3->AsLclVar()->GetLclNum();
+ offset = 0;
+ break;
+ }
+
+ default:
+ unreached();
+ break;
+ }
+ }
+
+ // Ensure we got a good varNum and offset.
+ // We also need to check for `tmpDsc != nullptr` since spill temp numbers
+ // are negative and start with -1, which also happens to be BAD_VAR_NUM.
+ assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
+ assert(offset != (unsigned)-1);
+
+ emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
+ }
+ else
+ {
+ emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
+ }
+}
+
// genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
// with non-constant argument
//
@@ -1560,7 +1678,95 @@ void CodeGen::genBMI2Intrinsic(GenTreeHWIntrinsic* node)
//
void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
{
- NYI("Implement FMA intrinsic code generation");
+ NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
+ var_types baseType = node->gtSIMDBaseType;
+ HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID);
+ emitAttr attr = EA_ATTR(node->gtSIMDSize);
+ instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+ GenTree* op1 = node->gtGetOp1();
+ regNumber targetReg = node->gtRegNum;
+
+ assert(Compiler::numArgsOfHWIntrinsic(node) == 3);
+ assert(op1 != nullptr);
+ assert(op1->OperIsList());
+ assert(op1->gtGetOp2()->OperIsList());
+ assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
+
+ GenTreeArgList* argList = op1->AsArgList();
+ op1 = argList->Current();
+ genConsumeRegs(op1);
+
+ argList = argList->Rest();
+ GenTree* op2 = argList->Current();
+ genConsumeRegs(op2);
+
+ argList = argList->Rest();
+ GenTree* op3 = argList->Current();
+ genConsumeRegs(op3);
+
+ regNumber op1Reg;
+ regNumber op2Reg;
+
+ bool isCommutative = false;
+ bool copyUpperBits = (flags & HW_Flag_CopyUpperBits) != 0;
+
+ // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
+ assert(!copyUpperBits || !op1->isContained());
+
+ if (op3->isContained() || op3->isUsedFromSpillTemp())
+ {
+ // 213 form: op1 = (op2 * op1) + [op3]
+
+ op1Reg = op1->gtRegNum;
+ op2Reg = op2->gtRegNum;
+
+ isCommutative = !copyUpperBits;
+ }
+ else if (op2->isContained() || op2->isUsedFromSpillTemp())
+ {
+ // 132 form: op1 = (op1 * op3) + [op2]
+
+ ins = (instruction)(ins - 1);
+ op1Reg = op1->gtRegNum;
+ op2Reg = op3->gtRegNum;
+ op3 = op2;
+ }
+ else if (op1->isContained() || op1->isUsedFromSpillTemp())
+ {
+ // 231 form: op3 = (op2 * op3) + [op1]
+
+ ins = (instruction)(ins + 1);
+ op1Reg = op3->gtRegNum;
+ op2Reg = op2->gtRegNum;
+ op3 = op1;
+ }
+ else
+ {
+ // 213 form: op1 = (op2 * op1) + op3
+
+ op1Reg = op1->gtRegNum;
+ op2Reg = op2->gtRegNum;
+
+ isCommutative = !copyUpperBits;
+ }
+
+ if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
+ {
+ assert(node->isRMWHWIntrinsic(compiler));
+
+ // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
+ //
+ // For non-commutative intrinsics, we should have ensured that op2 was marked
+ // delay free in order to prevent it from getting assigned the same register
+ // as target. However, for commutative intrinsics, we can just swap the operands
+ // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
+
+ op2Reg = op1Reg;
+ op1Reg = targetReg;
+ }
+
+ genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
+ genProduceReg(node);
}
//------------------------------------------------------------------------
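
To make the form selection in genFMAIntrinsic above easier to follow, here is a self-contained model. It assumes only what the diff shows: the instruction table lists each FMA group as 132, 213, 231 in adjacent slots, the table entry for an intrinsic is its 213 form (so "ins - 1" is the 132 form and "ins + 1" is the 231 form), and whichever argument is contained becomes the memory operand. The enum and helper are illustrative stand-ins, not JIT code:

// Model of the 213/132/231 selection in genFMAIntrinsic: every path still
// computes MultiplyAdd(op1, op2, op3) == op1*op2 + op3, only the operand
// placement changes so the contained argument can be the memory operand.
#include <cassert>
#include <cstdio>

enum Form { F132, F213, F231 }; // adjacent, mirroring the INST3 table ordering

// dst is the register that is both first source and destination,
// src2 is the second register operand, mem is the memory operand.
double fmaForm(Form f, double dst, double src2, double mem)
{
    switch (f)
    {
        case F132: return dst * mem + src2;  // dst = dst * src3 + src2
        case F213: return src2 * dst + mem;  // dst = src2 * dst + src3
        case F231: return src2 * mem + dst;  // dst = src2 * src3 + dst
    }
    return 0.0;
}

int main()
{
    double op1 = 2.0, op2 = 3.0, op3 = 4.0; // expect op1*op2 + op3 == 10
    Form base = F213;                       // the table entry is the 213 form

    // op3 contained: keep the 213 form, operands stay in place
    assert(fmaForm(base, op1, op2, op3) == op1 * op2 + op3);

    // op2 contained: "ins - 1" selects 132; op3 moves to the second register slot
    assert(fmaForm(Form(base - 1), op1, op3, op2) == op1 * op2 + op3);

    // op1 contained: "ins + 1" selects 231; op3 becomes the destination register
    assert(fmaForm(Form(base + 1), op3, op2, op1) == op1 * op2 + op3);

    printf("all three forms compute %g\n", op1 * op2 + op3);
    return 0;
}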
diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h
index c6450f45ac..48a6888dc1 100644
--- a/src/jit/hwintrinsiclistxarch.h
+++ b/src/jit/hwintrinsiclistxarch.h
@@ -476,6 +476,16 @@ HARDWARE_INTRINSIC(BMI2_IsSupported, "get_IsSuppo
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// FMA Intrinsics
HARDWARE_INTRINSIC(FMA_IsSupported, "get_IsSupported", FMA, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(FMA_MultiplyAdd, "MultiplyAdd", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmadd213ps, INS_vfmadd213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplyAddNegated, "MultiplyAddNegated", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmadd213ps, INS_vfnmadd213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplyAddNegatedScalar, "MultiplyAddNegatedScalar", FMA, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmadd213ss, INS_vfnmadd213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(FMA_MultiplyAddScalar, "MultiplyAddScalar", FMA, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmadd213ss, INS_vfmadd213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(FMA_MultiplyAddSubtract, "MultiplyAddSubtract", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmaddsub213ps, INS_vfmaddsub213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplySubtract, "MultiplySubtract", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmsub213ps, INS_vfmsub213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplySubtractAdd, "MultiplySubtractAdd", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmsubadd213ps, INS_vfmsubadd213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplySubtractNegated, "MultiplySubtractNegated", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmsub213ps, INS_vfnmsub213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplySubtractScalar, "MultiplySubtractScalar", FMA, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmsub213ss, INS_vfmsub213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(FMA_MultiplySubtractNegatedScalar, "MultiplySubtractNegatedScalar", FMA, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmsub213ss, INS_vfnmsub213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_CopyUpperBits)
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags
diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp
index a6a5f1452c..d3cdbb33df 100644
--- a/src/jit/hwintrinsicxarch.cpp
+++ b/src/jit/hwintrinsicxarch.cpp
@@ -547,7 +547,6 @@ bool Compiler::isFullyImplmentedISAClass(InstructionSet isa)
case InstructionSet_AES:
case InstructionSet_BMI1:
case InstructionSet_BMI2:
- case InstructionSet_FMA:
case InstructionSet_PCLMULQDQ:
return false;
@@ -563,6 +562,7 @@ bool Compiler::isFullyImplmentedISAClass(InstructionSet isa)
case InstructionSet_SSE3:
case InstructionSet_SSSE3:
case InstructionSet_SSE41:
+ case InstructionSet_FMA:
case InstructionSet_LZCNT:
case InstructionSet_POPCNT:
return true;
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index fa93ad217c..489baa8ec3 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -499,6 +499,71 @@ INST3(vbroadcastf128,"broadcastf128",0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS
INST3(vbroadcasti128,"broadcasti128",0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x5A)) // Broadcast packed integer values read from memory to entire ymm register
INST3(vmaskmovps, "maskmovps" ,0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2C)) // Conditional SIMD Packed Loads Float
INST3(vmaskmovpd, "maskmovpd" ,0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2D)) // Conditional SIMD Packed Loads Double
+
+INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
+// enum name FP updmode rf wf MR MI RM
+INST3(vfmadd132pd, "fmadd132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x98)) // Fused Multiply-Add of Packed Double-Precision Floating-Point Values
+INST3(vfmadd213pd, "fmadd213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA8)) //
+INST3(vfmadd231pd, "fmadd231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB8)) //
+INST3(vfmadd132ps, "fmadd132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x98)) // Fused Multiply-Add of Packed Single-Precision Floating-Point Values
+INST3(vfmadd213ps, "fmadd213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA8)) //
+INST3(vfmadd231ps, "fmadd231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB8)) //
+INST3(vfmadd132sd, "fmadd132sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x99)) // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values
+INST3(vfmadd213sd, "fmadd213sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA9)) //
+INST3(vfmadd231sd, "fmadd231sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB9)) //
+INST3(vfmadd132ss, "fmadd132ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x99)) // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values
+INST3(vfmadd213ss, "fmadd213ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA9)) //
+INST3(vfmadd231ss, "fmadd231ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB9)) //
+INST3(vfmaddsub132pd, "fmaddsub132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x96)) // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values
+INST3(vfmaddsub213pd, "fmaddsub213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA6)) //
+INST3(vfmaddsub231pd, "fmaddsub231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB6)) //
+INST3(vfmaddsub132ps, "fmaddsub132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x96)) // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values
+INST3(vfmaddsub213ps, "fmaddsub213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA6)) //
+INST3(vfmaddsub231ps, "fmaddsub231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB6)) //
+INST3(vfmsubadd132pd, "fmsubadd132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x97)) // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values
+INST3(vfmsubadd213pd, "fmsubadd213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA7)) //
+INST3(vfmsubadd231pd, "fmsubadd231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB7)) //
+INST3(vfmsubadd132ps, "fmsubadd132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x97)) // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision Floating-Point Values
+INST3(vfmsubadd213ps, "fmsubadd213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA7)) //
+INST3(vfmsubadd231ps, "fmsubadd231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB7)) //
+INST3(vfmsub132pd, "fmsub132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9A)) // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values
+INST3(vfmsub213pd, "fmsub213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAA)) //
+INST3(vfmsub231pd, "fmsub231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBA)) //
+INST3(vfmsub132ps, "fmsub132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9A)) // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values
+INST3(vfmsub213ps, "fmsub213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAA)) //
+INST3(vfmsub231ps, "fmsub231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBA)) //
+INST3(vfmsub132sd, "fmsub132sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9B)) // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values
+INST3(vfmsub213sd, "fmsub213sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAB)) //
+INST3(vfmsub231sd, "fmsub231sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBB)) //
+INST3(vfmsub132ss, "fmsub132ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9B)) // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values
+INST3(vfmsub213ss, "fmsub213ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAB)) //
+INST3(vfmsub231ss, "fmsub231ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBB)) //
+INST3(vfnmadd132pd, "fmnadd132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9C)) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values
+INST3(vfnmadd213pd, "fmnadd213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAC)) //
+INST3(vfnmadd231pd, "fmnadd231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBC)) //
+INST3(vfnmadd132ps, "fmnadd132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9C)) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values
+INST3(vfnmadd213ps, "fmnadd213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAC)) //
+INST3(vfnmadd231ps, "fmnadd231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBC)) //
+INST3(vfnmadd132sd, "fmnadd132sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9D)) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values
+INST3(vfnmadd213sd, "fmnadd213sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAD)) //
+INST3(vfnmadd231sd, "fmnadd231sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBD)) //
+INST3(vfnmadd132ss, "fmnadd132ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9D)) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values
+INST3(vfnmadd213ss, "fmnadd213ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAD)) //
+INST3(vfnmadd231ss, "fmnadd231ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBD)) //
+INST3(vfnmsub132pd, "fmnsub132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9E)) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values
+INST3(vfnmsub213pd, "fmnsub213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAE)) //
+INST3(vfnmsub231pd, "fmnsub231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBE)) //
+INST3(vfnmsub132ps, "fmnsub132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9E)) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values
+INST3(vfnmsub213ps, "fmnsub213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAE)) //
+INST3(vfnmsub231ps, "fmnsub231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBE)) //
+INST3(vfnmsub132sd, "fmnsub132sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9F)) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values
+INST3(vfnmsub213sd, "fmnsub213sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAF)) //
+INST3(vfnmsub231sd, "fmnsub231sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBF)) //
+INST3(vfnmsub132ss, "fmnsub132ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9F)) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values
+INST3(vfnmsub213ss, "fmnsub213ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAF)) //
+INST3(vfnmsub231ss, "fmnsub231ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBF)) //
+INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
+
INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
// Scalar instructions in SSE4.2
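
A note on the table layout above: the FIRST_FMA_INSTRUCTION and LAST_FMA_INSTRUCTION sentinels bracket the new entries so that IsFMAInstruction in emitxarch.cpp is a plain range check, and each 132/213/231 trio is kept adjacent so codegen can step between forms with +/- 1. A toy sketch of that layout (the enum values are illustrative stand-ins, not the real INS_* identifiers):

// Sketch of the sentinel layout: FIRST_/LAST_FMA_INSTRUCTION bracket the group
// so IsFMAInstruction is a simple range check, and each 132/213/231 trio is
// adjacent so codegen can move between forms with "ins - 1" / "ins + 1".
#include <cassert>

enum instruction
{
    INS_addps,
    INS_FIRST_FMA_INSTRUCTION,
    INS_vfmadd132ps,
    INS_vfmadd213ps,
    INS_vfmadd231ps,
    INS_LAST_FMA_INSTRUCTION,
    INS_pextrq
};

bool IsFMAInstruction(instruction ins)
{
    return (ins >= INS_FIRST_FMA_INSTRUCTION) && (ins <= INS_LAST_FMA_INSTRUCTION);
}

int main()
{
    assert(IsFMAInstruction(INS_vfmadd213ps));
    assert(!IsFMAInstruction(INS_addps));
    assert(INS_vfmadd213ps == INS_vfmadd132ps + 1); // enables "ins - 1" for the 132 form
    assert(INS_vfmadd231ps == INS_vfmadd213ps + 1); // enables "ins + 1" for the 231 form
    return 0;
}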
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 0bf0c241a5..d53d3a5aa0 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -2410,6 +2410,51 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}
}
+ else if ((intrinsicID >= NI_FMA_MultiplyAdd) && (intrinsicID <= NI_FMA_MultiplySubtractNegatedScalar))
+ {
+ assert(numArgs == 3);
+ assert(op1->OperIsList());
+
+ GenTreeArgList* argList = op1->AsArgList();
+ op1 = argList->Current();
+
+ argList = argList->Rest();
+ GenTree* op2 = argList->Current();
+
+ argList = argList->Rest();
+ GenTree* op3 = argList->Current();
+
+ if (IsContainableHWIntrinsicOp(node, op3))
+ {
+ // 213 form: op1 = (op2 * op1) + [op3]
+ MakeSrcContained(node, op3);
+ }
+ else if (IsContainableHWIntrinsicOp(node, op2))
+ {
+ // 132 form: op1 = (op1 * op3) + [op2]
+ MakeSrcContained(node, op2);
+ }
+ else if (IsContainableHWIntrinsicOp(node, op1))
+ {
+ // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
+
+ if ((flags & HW_Flag_CopyUpperBits) == 0)
+ {
+ // 231 form: op3 = (op2 * op3) + [op1]
+ MakeSrcContained(node, op1);
+ }
+ }
+ else
+ {
+ // TODO-XArch-CQ: Technically any one of the three operands can
+ // be reg-optional. With a limitation on op1 where
+ // it can only be so if CopyUpperBits is off.
+ // https://github.com/dotnet/coreclr/issues/6361
+
+ // 213 form: op1 = (op2 * op1) + op3
+ op3->SetRegOptional();
+ }
+ }
if (Compiler::categoryOfHWIntrinsic(intrinsicID) == HW_Category_IMM)
{
diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp
index 3c5157d80f..96af2f6d8a 100644
--- a/src/jit/lsraxarch.cpp
+++ b/src/jit/lsraxarch.cpp
@@ -2508,6 +2508,94 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
}
#endif // _TARGET_X86_
+ case NI_FMA_MultiplyAdd:
+ case NI_FMA_MultiplyAddNegated:
+ case NI_FMA_MultiplyAddNegatedScalar:
+ case NI_FMA_MultiplyAddScalar:
+ case NI_FMA_MultiplyAddSubtract:
+ case NI_FMA_MultiplySubtract:
+ case NI_FMA_MultiplySubtractAdd:
+ case NI_FMA_MultiplySubtractNegated:
+ case NI_FMA_MultiplySubtractNegatedScalar:
+ case NI_FMA_MultiplySubtractScalar:
+ {
+ assert(numArgs == 3);
+ assert(isRMW);
+
+ bool copyUpperBits = (flags & HW_Flag_CopyUpperBits) != 0;
+
+ // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
+ assert(!copyUpperBits || !op1->isContained());
+
+ if (op3->isContained())
+ {
+ // 213 form: op1 = (op2 * op1) + [op3]
+
+ if (copyUpperBits)
+ {
+ tgtPrefUse = BuildUse(op1);
+
+ srcCount += 1;
+ srcCount += BuildDelayFreeUses(op2);
+ }
+ else
+ {
+ // op1 and op2 are commutative, so don't
+ // set either to be tgtPref or delayFree
+
+ srcCount += BuildOperandUses(op1);
+ srcCount += BuildOperandUses(op2);
+ }
+
+ srcCount += BuildOperandUses(op3);
+ }
+ else if (op2->isContained())
+ {
+ // 132 form: op1 = (op1 * op3) + [op2]
+
+ tgtPrefUse = BuildUse(op1);
+
+ srcCount += 1;
+ srcCount += BuildOperandUses(op2);
+ srcCount += BuildDelayFreeUses(op3);
+ }
+ else if (op1->isContained())
+ {
+ // 231 form: op3 = (op2 * op3) + [op1]
+
+ tgtPrefUse = BuildUse(op3);
+
+ srcCount += BuildOperandUses(op1);
+ srcCount += BuildDelayFreeUses(op2);
+ srcCount += 1;
+ }
+ else
+ {
+ // 213 form: op1 = (op2 * op1) + op3
+
+ if (copyUpperBits)
+ {
+ tgtPrefUse = BuildUse(op1);
+
+ srcCount += 1;
+ srcCount += BuildDelayFreeUses(op2);
+ }
+ else
+ {
+ // op1 and op2 are commutative, so don't
+ // set either to be tgtPref or delayFree
+
+ srcCount += BuildOperandUses(op1);
+ srcCount += BuildOperandUses(op2);
+ }
+
+ srcCount += BuildDelayFreeUses(op3);
+ }
+
+ buildUses = false;
+ break;
+ }
+
default:
{
assert((intrinsicID > NI_HW_INTRINSIC_START) && (intrinsicID < NI_HW_INTRINSIC_END));