author    Tanner Gooding <tagoo@outlook.com>  2018-05-23 11:20:29 -0700
committer Tanner Gooding <tagoo@outlook.com>  2018-05-25 16:08:06 -0700
commit    5fc7dd5917d9e0d2345023a87f73a32c2ebfdbe8
tree      7eeacc428a501e73d38610e443bfcf0992f0853e
parent    6977efd6b2a7b2d79479ac461da33b6512a72f90
Updating the JIT to handle the FMA hardware intrinsics.
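These intrinsics expose the x86 FMA ISA extension, which computes (a * b) + c as a single instruction with one rounding step instead of two. The fusion matters for correctness as well as speed: a fused result can differ from a separate multiply-then-add. A minimal standalone illustration of that difference (an editorial sketch using std::fma, not code from this commit; the file name and flags are only suggestions):

    // fma_demo.cpp -- fused vs. unfused rounding
    // Build with contraction disabled so (a * b) + c is not itself fused:
    //   g++ -std=c++17 -ffp-contract=off fma_demo.cpp
    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Chosen so the exact product a * b == 1 - 2^-54, which rounds up to
        // exactly 1.0 when stored into a double.
        double a = 1.0 + 0x1p-27;
        double b = 1.0 - 0x1p-27;
        double c = -1.0;

        double unfused = (a * b) + c;       // product rounds to 1.0, result is 0.0
        double fused   = std::fma(a, b, c); // exact result: -2^-54

        printf("unfused = %g, fused = %g\n", unfused, fused);
        return 0;
    }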
 src/jit/codegenlinear.h             |   2
 src/jit/emitxarch.cpp               | 189
 src/jit/emitxarch.h                 |  13
 src/jit/gentree.cpp                 |  12
 src/jit/hwintrinsiccodegenxarch.cpp | 208
 src/jit/hwintrinsiclistxarch.h      |  10
 src/jit/hwintrinsicxarch.cpp        |   2
 src/jit/instrsxarch.h               |  65
 src/jit/lowerxarch.cpp              |  45
 src/jit/lsraxarch.cpp               |  88
 10 files changed, 630 insertions(+), 4 deletions(-)
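Most of the new JIT logic below exists to pick among the three operand orderings that the FMA extension defines for every operation. All orderings compute the same fused result; they differ in which source register is overwritten and therefore which operand may live in memory (only the last one can). A sketch for orientation, using the vfmadd family as the example (illustrative C++, not code from the patch; the fmsub/fnmadd/fnmsub/fmaddsub/fmsubadd families follow the same pattern):

    // The three hardware forms of vfmadd:
    //
    //   vfmadd132 xmm1, xmm2, xmm3/mem:  xmm1 = (xmm1 * xmm3) + xmm2
    //   vfmadd213 xmm1, xmm2, xmm3/mem:  xmm1 = (xmm2 * xmm1) + xmm3
    //   vfmadd231 xmm1, xmm2, xmm3/mem:  xmm1 = (xmm2 * xmm3) + xmm1
    //
    // The tables below default to the 213 form; lowering switches to 132 or 231
    // when it decides a different operand should be the contained (memory) one.
    #include <immintrin.h>

    // Compile with -mfma. The compiler selects whichever hardware form fits its
    // register assignment; the per-lane semantics are always (a * b) + c.
    __m128 fmadd_ps(__m128 a, __m128 b, __m128 c)
    {
        return _mm_fmadd_ps(a, b, c);
    }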
diff --git a/src/jit/codegenlinear.h b/src/jit/codegenlinear.h
index 5fa5b4aef0..0579fa5a00 100644
--- a/src/jit/codegenlinear.h
+++ b/src/jit/codegenlinear.h
@@ -117,6 +117,8 @@ void genHWIntrinsic(GenTreeHWIntrinsic* node);
 #if defined(_TARGET_XARCH_)
 void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins);
 void genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins);
+void genHWIntrinsic_R_R_R_RM(
+    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3);
 void genSSEIntrinsic(GenTreeHWIntrinsic* node);
 void genSSE2Intrinsic(GenTreeHWIntrinsic* node);
 void genSSE41Intrinsic(GenTreeHWIntrinsic* node);
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index 4fdc5c8cb2..706ce5203d 100644
--- a/src/jit/emitxarch.cpp
+++ b/src/jit/emitxarch.cpp
@@ -45,6 +45,11 @@ bool IsAVXOnlyInstruction(instruction ins)
     return (ins >= INS_FIRST_AVX_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION);
 }
 
+bool IsFMAInstruction(instruction ins)
+{
+    return (ins >= INS_FIRST_FMA_INSTRUCTION) && (ins <= INS_LAST_FMA_INSTRUCTION);
+}
+
 bool emitter::IsAVXInstruction(instruction ins)
 {
     return UseVEXEncoding() && IsSSEOrAVXInstruction(ins);
@@ -206,6 +211,66 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
         case INS_unpcklps:
         case INS_unpckhpd:
         case INS_unpcklpd:
+        case INS_vfmadd132pd:
+        case INS_vfmadd213pd:
+        case INS_vfmadd231pd:
+        case INS_vfmadd132ps:
+        case INS_vfmadd213ps:
+        case INS_vfmadd231ps:
+        case INS_vfmadd132sd:
+        case INS_vfmadd213sd:
+        case INS_vfmadd231sd:
+        case INS_vfmadd132ss:
+        case INS_vfmadd213ss:
+        case INS_vfmadd231ss:
+        case INS_vfmaddsub132pd:
+        case INS_vfmaddsub213pd:
+        case INS_vfmaddsub231pd:
+        case INS_vfmaddsub132ps:
+        case INS_vfmaddsub213ps:
+        case INS_vfmaddsub231ps:
+        case INS_vfmsubadd132pd:
+        case INS_vfmsubadd213pd:
+        case INS_vfmsubadd231pd:
+        case INS_vfmsubadd132ps:
+        case INS_vfmsubadd213ps:
+        case INS_vfmsubadd231ps:
+        case INS_vfmsub132pd:
+        case INS_vfmsub213pd:
+        case INS_vfmsub231pd:
+        case INS_vfmsub132ps:
+        case INS_vfmsub213ps:
+        case INS_vfmsub231ps:
+        case INS_vfmsub132sd:
+        case INS_vfmsub213sd:
+        case INS_vfmsub231sd:
+        case INS_vfmsub132ss:
+        case INS_vfmsub213ss:
+        case INS_vfmsub231ss:
+        case INS_vfnmadd132pd:
+        case INS_vfnmadd213pd:
+        case INS_vfnmadd231pd:
+        case INS_vfnmadd132ps:
+        case INS_vfnmadd213ps:
+        case INS_vfnmadd231ps:
+        case INS_vfnmadd132sd:
+        case INS_vfnmadd213sd:
+        case INS_vfnmadd231sd:
+        case INS_vfnmadd132ss:
+        case INS_vfnmadd213ss:
+        case INS_vfnmadd231ss:
+        case INS_vfnmsub132pd:
+        case INS_vfnmsub213pd:
+        case INS_vfnmsub231pd:
+        case INS_vfnmsub132ps:
+        case INS_vfnmsub213ps:
+        case INS_vfnmsub231ps:
+        case INS_vfnmsub132sd:
+        case INS_vfnmsub213sd:
+        case INS_vfnmsub231sd:
+        case INS_vfnmsub132ss:
+        case INS_vfnmsub213ss:
+        case INS_vfnmsub231ss:
         case INS_vinsertf128:
         case INS_vinserti128:
         case INS_vmaskmovps:
@@ -368,6 +433,36 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr)
         case INS_vpsllvq:
         case INS_pinsrq:
         case INS_pextrq:
+        case INS_vfmadd132pd:
+        case INS_vfmadd213pd:
+        case INS_vfmadd231pd:
+        case INS_vfmadd132sd:
+        case INS_vfmadd213sd:
+        case INS_vfmadd231sd:
+        case INS_vfmaddsub132pd:
+        case INS_vfmaddsub213pd:
+        case INS_vfmaddsub231pd:
+        case INS_vfmsubadd132pd:
+        case INS_vfmsubadd213pd:
+        case INS_vfmsubadd231pd:
+        case INS_vfmsub132pd:
+        case INS_vfmsub213pd:
+        case INS_vfmsub231pd:
+        case INS_vfmsub132sd:
+        case INS_vfmsub213sd:
+        case INS_vfmsub231sd:
+        case INS_vfnmadd132pd:
+        case INS_vfnmadd213pd:
+        case INS_vfnmadd231pd:
+        case INS_vfnmadd132sd:
+        case INS_vfnmadd213sd:
+        case INS_vfnmadd231sd:
+        case INS_vfnmsub132pd:
+        case INS_vfnmsub213pd:
+        case INS_vfnmsub231pd:
+        case INS_vfnmsub132sd:
+        case INS_vfnmsub213sd:
+        case INS_vfnmsub231sd:
             return true;
         default:
             break;
@@ -5360,12 +5455,85 @@ void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber reg,
     }
 }
 
+void emitter::emitIns_SIMD_R_R_R_A(
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, GenTreeIndir* indir)
+{
+    assert(IsFMAInstruction(ins));
+    assert(UseVEXEncoding());
+
+    if (reg != reg1)
+    {
+        // Ensure we aren't overwriting op2
+        assert(reg != reg2);
+
+        emitIns_R_R(INS_movaps, attr, reg, reg1);
+    }
+
+    emitIns_R_R_A(ins, attr, reg, reg2, indir, IF_RWR_RRD_ARD);
+}
+
+void emitter::emitIns_SIMD_R_R_R_AR(
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber base)
+{
+    assert(IsFMAInstruction(ins));
+    assert(UseVEXEncoding());
+
+    if (reg != reg1)
+    {
+        // Ensure we aren't overwriting op2
+        assert(reg != reg2);
+
+        emitIns_R_R(INS_movaps, attr, reg, reg1);
+    }
+
+    emitIns_R_R_AR(ins, attr, reg, reg2, base, 0);
+}
+
+void emitter::emitIns_SIMD_R_R_R_C(instruction          ins,
+                                   emitAttr             attr,
+                                   regNumber            reg,
+                                   regNumber            reg1,
+                                   regNumber            reg2,
+                                   CORINFO_FIELD_HANDLE fldHnd,
+                                   int                  offs)
+{
+    assert(IsFMAInstruction(ins));
+    assert(UseVEXEncoding());
+
+    if (reg != reg1)
+    {
+        // Ensure we aren't overwriting op2
+        assert(reg != reg2);
+
+        emitIns_R_R(INS_movaps, attr, reg, reg1);
+    }
+
+    emitIns_R_R_C(ins, attr, reg, reg2, fldHnd, offs);
+}
+
 void emitter::emitIns_SIMD_R_R_R_R(
     instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3)
 {
-    assert(isAvxBlendv(ins) || isSse41Blendv(ins));
-    if (UseVEXEncoding())
+    if (IsFMAInstruction(ins))
     {
+        assert(UseVEXEncoding());
+
+        if (reg != reg1)
+        {
+            // Ensure we aren't overwriting op2 or op3
+
+            assert(reg != reg2);
+            assert(reg != reg3);
+
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
+        }
+
+        emitIns_R_R_R(ins, attr, reg, reg2, reg3);
+    }
+    else if (UseVEXEncoding())
+    {
+        assert(isAvxBlendv(ins) || isSse41Blendv(ins));
+
         // convert SSE encoding of SSE4.1 instructions to VEX encoding
         switch (ins)
         {
@@ -5407,6 +5575,23 @@ void emitter::emitIns_SIMD_R_R_R_R(
     }
 }
 
+void emitter::emitIns_SIMD_R_R_R_S(
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int varx, int offs)
+{
+    assert(IsFMAInstruction(ins));
+    assert(UseVEXEncoding());
+
+    if (reg != reg1)
+    {
+        // Ensure we aren't overwriting op2
+        assert(reg != reg2);
+
+        emitIns_R_R(INS_movaps, attr, reg, reg1);
+    }
+
+    emitIns_R_R_S(ins, attr, reg, reg2, varx, offs);
+}
+
 void emitter::emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs)
 {
     if (UseVEXEncoding())
diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h
index f85157e1eb..31cafa1cf2 100644
--- a/src/jit/emitxarch.h
+++ b/src/jit/emitxarch.h
@@ -422,8 +422,21 @@ void emitIns_SIMD_R_R_C(
 void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs);
 void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2);
 void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int ival);
+void emitIns_SIMD_R_R_R_A(
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, GenTreeIndir* indir);
+void emitIns_SIMD_R_R_R_AR(
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber base);
+void emitIns_SIMD_R_R_R_C(instruction          ins,
+                          emitAttr             attr,
+                          regNumber            reg,
+                          regNumber            reg1,
+                          regNumber            reg2,
+                          CORINFO_FIELD_HANDLE fldHnd,
+                          int                  offs);
 void emitIns_SIMD_R_R_R_R(
     instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3);
+void emitIns_SIMD_R_R_R_S(
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int varx, int offs);
 #endif // FEATURE_HW_INTRINSICS
 
 enum EmitCallType
diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp
index 965538dcaf..d0230b8e29 100644
--- a/src/jit/gentree.cpp
+++ b/src/jit/gentree.cpp
@@ -17420,7 +17420,19 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp)
     switch (AsHWIntrinsic()->gtHWIntrinsicId)
     {
+        // TODO-XArch-Cleanup: Move this switch block to be table driven.
+        case NI_SSE42_Crc32:
+        case NI_FMA_MultiplyAdd:
+        case NI_FMA_MultiplyAddNegated:
+        case NI_FMA_MultiplyAddNegatedScalar:
+        case NI_FMA_MultiplyAddScalar:
+        case NI_FMA_MultiplyAddSubtract:
+        case NI_FMA_MultiplySubtract:
+        case NI_FMA_MultiplySubtractAdd:
+        case NI_FMA_MultiplySubtractNegated:
+        case NI_FMA_MultiplySubtractNegatedScalar:
+        case NI_FMA_MultiplySubtractScalar:
             return true;
 
         default:
diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp
index d8cac11cb0..42f0c7112c 100644
--- a/src/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/jit/hwintrinsiccodegenxarch.cpp
@@ -569,6 +569,124 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
     }
 }
 
+//------------------------------------------------------------------------
+// genHWIntrinsic_R_R_R_RM: Generates the code for a hardware intrinsic node that takes two register operands,
+//                          a register/memory operand, and that returns a value in register
+//
+// Arguments:
+//    ins       - The instruction being generated
+//    attr      - The emit attribute
+//    targetReg - The target register
+//    op1Reg    - The register of the first operand
+//    op2Reg    - The register of the second operand
+//    op3       - The third operand
+//
+void CodeGen::genHWIntrinsic_R_R_R_RM(
+    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3)
+{
+    assert(targetReg != REG_NA);
+    assert(op1Reg != REG_NA);
+    assert(op2Reg != REG_NA);
+
+    emitter* emit = getEmitter();
+
+    if (op3->isContained() || op3->isUsedFromSpillTemp())
+    {
+        TempDsc* tmpDsc = nullptr;
+        unsigned varNum = BAD_VAR_NUM;
+        unsigned offset = (unsigned)-1;
+
+        if (op3->isUsedFromSpillTemp())
+        {
+            assert(op3->IsRegOptional());
+
+            // TODO-XArch-Cleanup: The getSpillTempDsc...tempRlsTemp code is a fairly common
+            //                     pattern. It could probably be extracted to its own method.
+            tmpDsc = getSpillTempDsc(op3);
+            varNum = tmpDsc->tdTempNum();
+            offset = 0;
+
+            compiler->tmpRlsTemp(tmpDsc);
+        }
+        else if (op3->OperIsHWIntrinsic())
+        {
+            emit->emitIns_SIMD_R_R_R_AR(ins, attr, targetReg, op1Reg, op2Reg, op3->gtGetOp1()->gtRegNum);
+            return;
+        }
+        else if (op3->isIndir())
+        {
+            GenTreeIndir* memIndir = op3->AsIndir();
+            GenTree*      memBase  = memIndir->gtOp1;
+
+            switch (memBase->OperGet())
+            {
+                case GT_LCL_VAR_ADDR:
+                {
+                    varNum = memBase->AsLclVarCommon()->GetLclNum();
+                    offset = 0;
+
+                    // Ensure that all the GenTreeIndir values are set to their defaults.
+                    assert(!memIndir->HasIndex());
+                    assert(memIndir->Scale() == 1);
+                    assert(memIndir->Offset() == 0);
+
+                    break;
+                }
+
+                case GT_CLS_VAR_ADDR:
+                {
+                    emit->emitIns_SIMD_R_R_R_C(ins, attr, targetReg, op1Reg, op2Reg, memBase->gtClsVar.gtClsVarHnd, 0);
+                    return;
+                }
+
+                default:
+                {
+                    emit->emitIns_SIMD_R_R_R_A(ins, attr, targetReg, op1Reg, op2Reg, memIndir);
+                    return;
+                }
+            }
+        }
+        else
+        {
+            switch (op3->OperGet())
+            {
+                case GT_LCL_FLD:
+                {
+                    GenTreeLclFld* lclField = op3->AsLclFld();
+
+                    varNum = lclField->GetLclNum();
+                    offset = lclField->gtLclFld.gtLclOffs;
+                    break;
+                }
+
+                case GT_LCL_VAR:
+                {
+                    assert(op3->IsRegOptional() || !compiler->lvaTable[op3->gtLclVar.gtLclNum].lvIsRegCandidate());
+                    varNum = op3->AsLclVar()->GetLclNum();
+                    offset = 0;
+                    break;
+                }
+
+                default:
+                    unreached();
+                    break;
+            }
+        }
+
+        // Ensure we got a good varNum and offset.
+        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
+        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
+        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
+        assert(offset != (unsigned)-1);
+
+        emit->emitIns_SIMD_R_R_R_S(ins, attr, targetReg, op1Reg, op2Reg, varNum, offset);
+    }
+    else
+    {
+        emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->gtRegNum);
+    }
+}
+
 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
 //                                   with non-constant argument
 //
@@ -1560,7 +1678,95 @@ void CodeGen::genBMI2Intrinsic(GenTreeHWIntrinsic* node)
 //
 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
 {
-    NYI("Implement FMA intrinsic code generation");
+    NamedIntrinsic  intrinsicID = node->gtHWIntrinsicId;
+    var_types       baseType    = node->gtSIMDBaseType;
+    HWIntrinsicFlag flags       = Compiler::flagsOfHWIntrinsic(intrinsicID);
+    emitAttr        attr        = EA_ATTR(node->gtSIMDSize);
+    instruction     ins         = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+    GenTree*        op1         = node->gtGetOp1();
+    regNumber       targetReg   = node->gtRegNum;
+
+    assert(Compiler::numArgsOfHWIntrinsic(node) == 3);
+    assert(op1 != nullptr);
+    assert(op1->OperIsList());
+    assert(op1->gtGetOp2()->OperIsList());
+    assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
+
+    GenTreeArgList* argList = op1->AsArgList();
+    op1                     = argList->Current();
+    genConsumeRegs(op1);
+
+    argList      = argList->Rest();
+    GenTree* op2 = argList->Current();
+    genConsumeRegs(op2);
+
+    argList      = argList->Rest();
+    GenTree* op3 = argList->Current();
+    genConsumeRegs(op3);
+
+    regNumber op1Reg;
+    regNumber op2Reg;
+
+    bool isCommutative = false;
+    bool copyUpperBits = (flags & HW_Flag_CopyUpperBits) != 0;
+
+    // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
+    assert(!copyUpperBits || !op1->isContained());
+
+    if (op3->isContained() || op3->isUsedFromSpillTemp())
+    {
+        // 213 form: op1 = (op2 * op1) + [op3]
+
+        op1Reg = op1->gtRegNum;
+        op2Reg = op2->gtRegNum;
+
+        isCommutative = !copyUpperBits;
+    }
+    else if (op2->isContained() || op2->isUsedFromSpillTemp())
+    {
+        // 132 form: op1 = (op1 * op3) + [op2]
+
+        ins    = (instruction)(ins - 1);
+        op1Reg = op1->gtRegNum;
+        op2Reg = op3->gtRegNum;
+        op3    = op2;
+    }
+    else if (op1->isContained() || op1->isUsedFromSpillTemp())
+    {
+        // 231 form: op3 = (op2 * op3) + [op1]
+
+        ins    = (instruction)(ins + 1);
+        op1Reg = op3->gtRegNum;
+        op2Reg = op2->gtRegNum;
+        op3    = op1;
+    }
+    else
+    {
+        // 213 form: op1 = (op2 * op1) + op3
+
+        op1Reg = op1->gtRegNum;
+        op2Reg = op2->gtRegNum;
+
+        isCommutative = !copyUpperBits;
+    }
+
+    if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg))
+    {
+        assert(node->isRMWHWIntrinsic(compiler));
+
+        // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic.
+        //
+        // For non-commutative intrinsics, we should have ensured that op2 was marked
+        // delay free in order to prevent it from getting assigned the same register
+        // as target. However, for commutative intrinsics, we can just swap the operands
+        // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
+
+        op2Reg = op1Reg;
+        op1Reg = targetReg;
+    }
+
+    genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3);
+    genProduceReg(node);
 }
 
 //------------------------------------------------------------------------
diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h
index c6450f45ac..48a6888dc1 100644
--- a/src/jit/hwintrinsiclistxarch.h
+++ b/src/jit/hwintrinsiclistxarch.h
@@ -476,6 +476,16 @@ HARDWARE_INTRINSIC(BMI2_IsSupported, "get_IsSuppo
 // ************************************************************************************************************************************************************************************************
 // FMA Intrinsics
 HARDWARE_INTRINSIC(FMA_IsSupported, "get_IsSupported", FMA, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(FMA_MultiplyAdd, "MultiplyAdd", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmadd213ps, INS_vfmadd213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplyAddNegated, "MultiplyAddNegated", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmadd213ps, INS_vfnmadd213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplyAddNegatedScalar, "MultiplyAddNegatedScalar", FMA, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmadd213ss, INS_vfnmadd213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(FMA_MultiplyAddScalar, "MultiplyAddScalar", FMA, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmadd213ss, INS_vfmadd213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(FMA_MultiplyAddSubtract, "MultiplyAddSubtract", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmaddsub213ps, INS_vfmaddsub213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplySubtract, "MultiplySubtract", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmsub213ps, INS_vfmsub213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplySubtractAdd, "MultiplySubtractAdd", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmsubadd213ps, INS_vfmsubadd213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplySubtractNegated, "MultiplySubtractNegated", FMA, -1, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmsub213ps, INS_vfnmsub213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(FMA_MultiplySubtractScalar, "MultiplySubtractScalar", FMA, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmsub213ss, INS_vfmsub213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(FMA_MultiplySubtractNegatedScalar, "MultiplySubtractNegatedScalar", FMA, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmsub213ss, INS_vfnmsub213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_CopyUpperBits)
 // ************************************************************************************************************************************************************************************************
 // Intrinsic ID    Function name    ISA    ival    SIMD size    NumArg    instructions    Category    Flags
diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp
index a6a5f1452c..d3cdbb33df 100644
--- a/src/jit/hwintrinsicxarch.cpp
+++ b/src/jit/hwintrinsicxarch.cpp
@@ -547,7 +547,6 @@ bool Compiler::isFullyImplmentedISAClass(InstructionSet isa)
         case InstructionSet_AES:
         case InstructionSet_BMI1:
         case InstructionSet_BMI2:
-        case InstructionSet_FMA:
        case InstructionSet_PCLMULQDQ:
             return false;
 
@@ -563,6 +562,7 @@
         case InstructionSet_SSE3:
         case InstructionSet_SSSE3:
         case InstructionSet_SSE41:
+        case InstructionSet_FMA:
         case InstructionSet_LZCNT:
         case InstructionSet_POPCNT:
             return true;
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index fa93ad217c..489baa8ec3 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -499,6 +499,71 @@ INST3(vbroadcastf128,"broadcastf128",0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS
 INST3(vbroadcasti128,"broadcasti128",0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x5A)) // Broadcast packed integer values read from memory to entire ymm register
 INST3(vmaskmovps, "maskmovps" ,0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2C)) // Conditional SIMD Packed Loads Float
 INST3(vmaskmovpd, "maskmovpd" ,0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2D)) // Conditional SIMD Packed Loads Double
+
+INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
+//    enum name         name          FP  updmode rf wf  MR        MI        RM
+INST3(vfmadd132pd, "fmadd132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x98)) // Fused Multiply-Add of Packed Double-Precision Floating-Point Values
+INST3(vfmadd213pd, "fmadd213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA8)) //
+INST3(vfmadd231pd, "fmadd231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB8)) //
+INST3(vfmadd132ps, "fmadd132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x98)) // Fused Multiply-Add of Packed Single-Precision Floating-Point Values
+INST3(vfmadd213ps, "fmadd213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA8)) //
+INST3(vfmadd231ps, "fmadd231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB8)) //
+INST3(vfmadd132sd, "fmadd132sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x99)) // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values
+INST3(vfmadd213sd, "fmadd213sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA9)) //
+INST3(vfmadd231sd, "fmadd231sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB9)) //
+INST3(vfmadd132ss, "fmadd132ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x99)) // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values
+INST3(vfmadd213ss, "fmadd213ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA9)) //
+INST3(vfmadd231ss, "fmadd231ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB9)) //
+INST3(vfmaddsub132pd, "fmaddsub132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x96)) // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values
+INST3(vfmaddsub213pd, "fmaddsub213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA6)) //
+INST3(vfmaddsub231pd, "fmaddsub231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB6)) //
+INST3(vfmaddsub132ps, "fmaddsub132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x96)) // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values
+INST3(vfmaddsub213ps, "fmaddsub213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA6)) //
+INST3(vfmaddsub231ps, "fmaddsub231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB6)) //
+INST3(vfmsubadd132pd, "fmsubadd132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x97)) // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values
+INST3(vfmsubadd213pd, "fmsubadd213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA7)) //
+INST3(vfmsubadd231pd, "fmsubadd231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB7)) //
+INST3(vfmsubadd132ps, "fmsubadd132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x97)) // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision Floating-Point Values
+INST3(vfmsubadd213ps, "fmsubadd213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xA7)) //
+INST3(vfmsubadd231ps, "fmsubadd231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xB7)) //
+INST3(vfmsub132pd, "fmsub132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9A)) // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values
+INST3(vfmsub213pd, "fmsub213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAA)) //
+INST3(vfmsub231pd, "fmsub231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBA)) //
+INST3(vfmsub132ps, "fmsub132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9A)) // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values
+INST3(vfmsub213ps, "fmsub213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAA)) //
+INST3(vfmsub231ps, "fmsub231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBA)) //
+INST3(vfmsub132sd, "fmsub132sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9B)) // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values
+INST3(vfmsub213sd, "fmsub213sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAB)) //
+INST3(vfmsub231sd, "fmsub231sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBB)) //
+INST3(vfmsub132ss, "fmsub132ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9B)) // Fused Multiply-Subtract of Scalar Single-Precision Floating-Point Values
+INST3(vfmsub213ss, "fmsub213ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAB)) //
+INST3(vfmsub231ss, "fmsub231ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBB)) //
+INST3(vfnmadd132pd, "fnmadd132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9C)) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values
+INST3(vfnmadd213pd, "fnmadd213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAC)) //
+INST3(vfnmadd231pd, "fnmadd231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBC)) //
+INST3(vfnmadd132ps, "fnmadd132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9C)) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values
+INST3(vfnmadd213ps, "fnmadd213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAC)) //
+INST3(vfnmadd231ps, "fnmadd231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBC)) //
+INST3(vfnmadd132sd, "fnmadd132sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9D)) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values
+INST3(vfnmadd213sd, "fnmadd213sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAD)) //
+INST3(vfnmadd231sd, "fnmadd231sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBD)) //
+INST3(vfnmadd132ss, "fnmadd132ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9D)) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values
+INST3(vfnmadd213ss, "fnmadd213ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAD)) //
+INST3(vfnmadd231ss, "fnmadd231ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBD)) //
+INST3(vfnmsub132pd, "fnmsub132pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9E)) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values
+INST3(vfnmsub213pd, "fnmsub213pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAE)) //
+INST3(vfnmsub231pd, "fnmsub231pd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBE)) //
+INST3(vfnmsub132ps, "fnmsub132ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9E)) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values
+INST3(vfnmsub213ps, "fnmsub213ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAE)) //
+INST3(vfnmsub231ps, "fnmsub231ps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBE)) //
+INST3(vfnmsub132sd, "fnmsub132sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9F)) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values
+INST3(vfnmsub213sd, "fnmsub213sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAF)) //
+INST3(vfnmsub231sd, "fnmsub231sd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBF)) //
+INST3(vfnmsub132ss, "fnmsub132ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x9F)) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values
+INST3(vfnmsub213ss, "fnmsub213ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xAF)) //
+INST3(vfnmsub231ss, "fnmsub231ss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0xBF)) //
+INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
+
 INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
 
 // Scalar instructions in SSE4.2
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 0bf0c241a5..d53d3a5aa0 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -2410,6 +2410,51 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
             break;
         }
     }
+    else if ((intrinsicID >= NI_FMA_MultiplyAdd) && (intrinsicID <= NI_FMA_MultiplySubtractNegatedScalar))
+    {
+        assert(numArgs == 3);
+        assert(op1->OperIsList());
+
+        GenTreeArgList* argList = op1->AsArgList();
+        op1                     = argList->Current();
+
+        argList      = argList->Rest();
+        GenTree* op2 = argList->Current();
+
+        argList      = argList->Rest();
+        GenTree* op3 = argList->Current();
+
+        if (IsContainableHWIntrinsicOp(node, op3))
+        {
+            // 213 form: op1 = (op2 * op1) + [op3]
+            MakeSrcContained(node, op3);
+        }
+        else if (IsContainableHWIntrinsicOp(node, op2))
+        {
+            // 132 form: op1 = (op1 * op3) + [op2]
+            MakeSrcContained(node, op2);
+        }
+        else if (IsContainableHWIntrinsicOp(node, op1))
+        {
+            // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
+
+            if ((flags & HW_Flag_CopyUpperBits) == 0)
+            {
+                // 231 form: op3 = (op2 * op3) + [op1]
+                MakeSrcContained(node, op1);
+            }
+        }
+        else
+        {
+            // TODO-XArch-CQ: Technically any one of the three operands can
+            //                be reg-optional. With a limitation on op1 where
+            //                it can only be so if CopyUpperBits is off.
+            //                https://github.com/dotnet/coreclr/issues/6361
+
+            // 213 form: op1 = (op2 * op1) + op3
+            op3->SetRegOptional();
+        }
+    }
 
     if (Compiler::categoryOfHWIntrinsic(intrinsicID) == HW_Category_IMM)
     {
diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp
index 3c5157d80f..96af2f6d8a 100644
--- a/src/jit/lsraxarch.cpp
+++ b/src/jit/lsraxarch.cpp
@@ -2508,6 +2508,94 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
         }
 #endif // _TARGET_X86_
 
+        case NI_FMA_MultiplyAdd:
+        case NI_FMA_MultiplyAddNegated:
+        case NI_FMA_MultiplyAddNegatedScalar:
+        case NI_FMA_MultiplyAddScalar:
+        case NI_FMA_MultiplyAddSubtract:
+        case NI_FMA_MultiplySubtract:
+        case NI_FMA_MultiplySubtractAdd:
+        case NI_FMA_MultiplySubtractNegated:
+        case NI_FMA_MultiplySubtractNegatedScalar:
+        case NI_FMA_MultiplySubtractScalar:
+        {
+            assert(numArgs == 3);
+            assert(isRMW);
+
+            bool copyUpperBits = (flags & HW_Flag_CopyUpperBits) != 0;
+
+            // Intrinsics with CopyUpperBits semantics cannot have op1 be contained
+            assert(!copyUpperBits || !op1->isContained());
+
+            if (op3->isContained())
+            {
+                // 213 form: op1 = (op2 * op1) + [op3]
+
+                if (copyUpperBits)
+                {
+                    tgtPrefUse = BuildUse(op1);
+
+                    srcCount += 1;
+                    srcCount += BuildDelayFreeUses(op2);
+                }
+                else
+                {
+                    // op1 and op2 are commutative, so don't
+                    // set either to be tgtPref or delayFree
+
+                    srcCount += BuildOperandUses(op1);
+                    srcCount += BuildOperandUses(op2);
+                }
+
+                srcCount += BuildOperandUses(op3);
+            }
+            else if (op2->isContained())
+            {
+                // 132 form: op1 = (op1 * op3) + [op2]
+
+                tgtPrefUse = BuildUse(op1);
+
+                srcCount += 1;
+                srcCount += BuildOperandUses(op2);
+                srcCount += BuildDelayFreeUses(op3);
+            }
+            else if (op1->isContained())
+            {
+                // 231 form: op3 = (op2 * op3) + [op1]
+
+                tgtPrefUse = BuildUse(op3);
+
+                srcCount += BuildOperandUses(op1);
+                srcCount += BuildDelayFreeUses(op2);
+                srcCount += 1;
+            }
+            else
+            {
+                // 213 form: op1 = (op2 * op1) + op3
+
+                if (copyUpperBits)
+                {
+                    tgtPrefUse = BuildUse(op1);
+
+                    srcCount += 1;
+                    srcCount += BuildDelayFreeUses(op2);
+                }
+                else
+                {
+                    // op1 and op2 are commutative, so don't
+                    // set either to be tgtPref or delayFree
+
+                    srcCount += BuildOperandUses(op1);
+                    srcCount += BuildOperandUses(op2);
+                }
+
+                srcCount += BuildDelayFreeUses(op3);
+            }
+
+            buildUses = false;
+            break;
+        }
+
         default:
         {
             assert((intrinsicID > NI_HW_INTRINSIC_START) && (intrinsicID < NI_HW_INTRINSIC_END));
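Taken together, the lowering, LSRA, and codegen changes implement a single selection policy. A condensed, illustrative C++ restatement (not code from the commit); it relies on instrsxarch.h declaring each FMA group in 132/213/231 order, so the table's default 213 instruction can be shifted by -1 or +1:

    // Which hardware form genFMAIntrinsic picks, given which operand the
    // lowering phase managed to contain (i.e. make the memory operand).
    enum Form
    {
        Form132 = -1, // op1 = (op1 * op3) + [op2]; ins = 213 form - 1
        Form213 = 0,  // op1 = (op2 * op1) + [op3]; the table default
        Form231 = +1, // op3 = (op2 * op3) + [op1]; ins = 213 form + 1
    };

    Form selectForm(bool op1Contained, bool op2Contained, bool op3Contained, bool copyUpperBits)
    {
        if (op3Contained)
            return Form213;
        if (op2Contained)
            return Form132;
        // Scalar intrinsics must copy the upper bits from op1, so lowering
        // never contains op1 for them (and codegen asserts as much).
        if (op1Contained && !copyUpperBits)
            return Form231;
        return Form213; // everything in registers: keep the default
    }

In the all-in-registers case, the packed (non-CopyUpperBits) intrinsics are additionally treated as commutative in op1 and op2, which lets codegen swap the two registers and avoid the extra vmovaps when op2 already occupies the target register.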