From 7ba1bf921700a66ee2e45ce5f706b3366ee493ba Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Sat, 28 Oct 2017 09:35:28 -0700 Subject: Adding SSE4.1 intrinsic support for Round, Ceiling, and Floor. --- src/jit/codegen.h | 3 + src/jit/codegenxarch.cpp | 173 +++++++++++++++ src/jit/compiler.h | 8 +- src/jit/emitfmtsxarch.h | 3 + src/jit/emitxarch.cpp | 565 +++++++++++++++++++++++++++++++++++++++++------ src/jit/emitxarch.h | 6 + src/jit/importer.cpp | 8 +- src/jit/instr.cpp | 3 +- src/jit/instrsxarch.h | 5 +- src/jit/lowerxarch.cpp | 6 +- src/jit/lsraxarch.cpp | 11 +- src/jit/rationalize.cpp | 4 +- src/jit/stackfp.cpp | 2 +- src/jit/valuenum.cpp | 2 +- 14 files changed, 713 insertions(+), 86 deletions(-) (limited to 'src') diff --git a/src/jit/codegen.h b/src/jit/codegen.h index d541a19014..bb8bc50f81 100644 --- a/src/jit/codegen.h +++ b/src/jit/codegen.h @@ -65,6 +65,9 @@ private: // Generates SSE2 code for the given tree as "Operand BitWiseOp BitMask" void genSSE2BitwiseOp(GenTreePtr treeNode); + + // Generates SSE41 code for the given tree as a round operation + void genSSE41RoundOp(GenTreeOp* treeNode); #endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87 void genPrepForCompiler(); diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 01ba9c2647..1e99da45bd 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -7331,6 +7331,173 @@ void CodeGen::genSSE2BitwiseOp(GenTreePtr treeNode) inst_RV_RV(ins, targetReg, operandReg, targetType); } +//----------------------------------------------------------------------------------------- +// genSSE41RoundOp - generate SSE41 code for the given tree as a round operation +// +// Arguments: +// treeNode - tree node +// +// Return value: +// None +// +// Assumptions: +// i) SSE4.1 is supported by the underlying hardware +// ii) treeNode oper is a GT_INTRINSIC +// iii) treeNode type is a floating point type +// iv) treeNode is not used from memory +// v) tree oper is 
CORINFO_INTRINSIC_Round, _Ceiling, or _Floor +// vi) caller of this routine needs to call genProduceReg() +void CodeGen::genSSE41RoundOp(GenTreeOp* treeNode) +{ + // i) SSE4.1 is supported by the underlying hardware + assert(compiler->compSupports(InstructionSet_SSE41)); + + // ii) treeNode oper is a GT_INTRINSIC + assert(treeNode->OperGet() == GT_INTRINSIC); + + GenTree* srcNode = treeNode->gtGetOp1(); + + // iii) treeNode type is floating point type + assert(varTypeIsFloating(srcNode)); + assert(srcNode->TypeGet() == treeNode->TypeGet()); + + // iv) treeNode is not used from memory + assert(!treeNode->isUsedFromMemory()); + + genConsumeOperands(treeNode); + + instruction ins = (treeNode->TypeGet() == TYP_FLOAT) ? INS_roundss : INS_roundsd; + emitAttr size = emitTypeSize(treeNode); + + regNumber dstReg = treeNode->gtRegNum; + + unsigned ival = 0; + + // v) tree oper is CORINFO_INTRINSIC_Round, _Ceiling, or _Floor + switch (treeNode->gtIntrinsic.gtIntrinsicId) + { + case CORINFO_INTRINSIC_Round: + ival = 4; + break; + + case CORINFO_INTRINSIC_Ceiling: + ival = 10; + break; + + case CORINFO_INTRINSIC_Floor: + ival = 9; + break; + + default: + ins = INS_invalid; + assert(!"genSSE41RoundOp: unsupported intrinsic"); + unreached(); + } + + if (srcNode->isContained() || srcNode->isUsedFromSpillTemp()) + { + emitter* emit = getEmitter(); + + TempDsc* tmpDsc = nullptr; + unsigned varNum = BAD_VAR_NUM; + unsigned offset = (unsigned)-1; + + if (srcNode->isUsedFromSpillTemp()) + { + assert(srcNode->IsRegOptional()); + + tmpDsc = getSpillTempDsc(srcNode); + varNum = tmpDsc->tdTempNum(); + offset = 0; + + compiler->tmpRlsTemp(tmpDsc); + } + else if (srcNode->isIndir()) + { + GenTreeIndir* memIndir = srcNode->AsIndir(); + GenTree* memBase = memIndir->gtOp1; + + switch (memBase->OperGet()) + { + case GT_LCL_VAR_ADDR: + { + varNum = memBase->AsLclVarCommon()->GetLclNum(); + offset = 0; + + // Ensure that all the GenTreeIndir values are set to their defaults. 
+ assert(memBase->gtRegNum == REG_NA); + assert(!memIndir->HasIndex()); + assert(memIndir->Scale() == 1); + assert(memIndir->Offset() == 0); + + break; + } + + case GT_CLS_VAR_ADDR: + { + emit->emitIns_R_C_I(ins, size, dstReg, memBase->gtClsVar.gtClsVarHnd, 0, ival); + return; + } + + default: + { + emit->emitIns_R_A_I(ins, size, dstReg, memIndir, ival); + return; + } + } + } + else + { + switch (srcNode->OperGet()) + { + case GT_CNS_DBL: + { + GenTreeDblCon* dblConst = srcNode->AsDblCon(); + CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(dblConst->gtDconVal, emitTypeSize(dblConst)); + + emit->emitIns_R_C_I(ins, size, dstReg, hnd, 0, ival); + return; + } + + case GT_LCL_FLD: + { + GenTreeLclFld* lclField = srcNode->AsLclFld(); + + varNum = lclField->GetLclNum(); + offset = lclField->gtLclFld.gtLclOffs; + break; + } + + case GT_LCL_VAR: + { + assert(srcNode->IsRegOptional() || + !compiler->lvaTable[srcNode->gtLclVar.gtLclNum].lvIsRegCandidate()); + + varNum = srcNode->AsLclVar()->GetLclNum(); + offset = 0; + break; + } + + default: + unreached(); + break; + } + } + + // Ensure we got a good varNum and offset. + // We also need to check for `tmpDsc != nullptr` since spill temp numbers + // are negative and start with -1, which also happens to be BAD_VAR_NUM. 
+ assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); + assert(offset != (unsigned)-1); + + emit->emitIns_R_S_I(ins, size, dstReg, varNum, offset, ival); + } + else + { + inst_RV_RV_IV(ins, size, dstReg, srcNode->gtRegNum, ival); + } +} + //--------------------------------------------------------------------- // genIntrinsic - generate code for a given intrinsic // @@ -7361,6 +7528,12 @@ void CodeGen::genIntrinsic(GenTreePtr treeNode) genSSE2BitwiseOp(treeNode); break; + case CORINFO_INTRINSIC_Round: + case CORINFO_INTRINSIC_Ceiling: + case CORINFO_INTRINSIC_Floor: + genSSE41RoundOp(treeNode->AsOp()); + break; + default: assert(!"genIntrinsic: Unsupported intrinsic"); unreached(); diff --git a/src/jit/compiler.h b/src/jit/compiler.h index c0123f9c72..0679d3199e 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -3222,10 +3222,10 @@ public: unsigned* typeSize, bool forReturn); - static bool IsIntrinsicImplementedByUserCall(CorInfoIntrinsics intrinsicId); - static bool IsTargetIntrinsic(CorInfoIntrinsics intrinsicId); - static bool IsMathIntrinsic(CorInfoIntrinsics intrinsicId); - static bool IsMathIntrinsic(GenTreePtr tree); + bool IsIntrinsicImplementedByUserCall(CorInfoIntrinsics intrinsicId); + bool IsTargetIntrinsic(CorInfoIntrinsics intrinsicId); + bool IsMathIntrinsic(CorInfoIntrinsics intrinsicId); + bool IsMathIntrinsic(GenTreePtr tree); private: //----------------- Importing the method ---------------------------------- diff --git a/src/jit/emitfmtsxarch.h b/src/jit/emitfmtsxarch.h index f00e64fb20..d0aef507e0 100644 --- a/src/jit/emitfmtsxarch.h +++ b/src/jit/emitfmtsxarch.h @@ -122,6 +122,7 @@ IF_DEF(MRD_OFF, IS_GM_RD, DSP) // offset mem IF_DEF(RRD_MRD, IS_GM_RD|IS_R1_RD, DSP) // read reg , read [mem] IF_DEF(RWR_MRD, IS_GM_RD|IS_R1_WR, DSP) // write reg , read [mem] IF_DEF(RRW_MRD, IS_GM_RD|IS_R1_RW, DSP) // r/w reg , read [mem] +IF_DEF(RRW_MRD_CNS, IS_GM_RD|IS_R1_RW, DSP_CNS) // r/w reg , read [mem], const IF_DEF(RWR_RRD_MRD, 
IS_GM_RD|IS_R1_WR|IS_R2_RD, DSP) // write reg , read reg2 , read [mem] IF_DEF(RWR_MRD_OFF, IS_GM_RD|IS_R1_WR, DSP) // write reg , offset mem @@ -147,6 +148,7 @@ IF_DEF(SRW, IS_SF_RW, NONE) // r/w [stk] IF_DEF(RRD_SRD, IS_SF_RD|IS_R1_RD, NONE) // read reg , read [stk] IF_DEF(RWR_SRD, IS_SF_RD|IS_R1_WR, NONE) // write reg , read [stk] IF_DEF(RRW_SRD, IS_SF_RD|IS_R1_RW, NONE) // r/w reg , read [stk] +IF_DEF(RRW_SRD_CNS, IS_SF_RD|IS_R1_RW, CNS ) // r/w reg , read [stk], const IF_DEF(RWR_RRD_SRD, IS_SF_RD|IS_R1_WR|IS_R2_RD, NONE) // write reg , read reg2, read [stk] @@ -172,6 +174,7 @@ IF_DEF(ARW, IS_AM_RW, AMD ) // r/w [adr] IF_DEF(RRD_ARD, IS_AM_RD|IS_R1_RD, AMD ) // read reg , read [adr] IF_DEF(RWR_ARD, IS_AM_RD|IS_R1_WR, AMD ) // write reg , read [adr] IF_DEF(RRW_ARD, IS_AM_RD|IS_R1_RW, AMD ) // r/w reg , read [adr] +IF_DEF(RRW_ARD_CNS, IS_AM_RD|IS_R1_RW, AMD_CNS) // r/w reg , read [adr], const IF_DEF(RWR_RRD_ARD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD ) // write reg , read reg2, read [adr] diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index 2f7f8e615d..d64a9ca093 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -193,6 +193,8 @@ bool emitter::IsDstSrcSrcAVXInstruction(instruction ins) case INS_movlpd: case INS_movlps: case INS_movss: + case INS_roundsd: + case INS_roundss: case INS_sqrtsd: case INS_sqrtss: return IsAVXInstruction(ins); @@ -2588,6 +2590,8 @@ emitter::insFormat emitter::emitMapFmtAtoM(insFormat fmt) return IF_RWR_MRD; case IF_RRW_ARD: return IF_RRW_MRD; + case IF_RRW_ARD_CNS: + return IF_RRW_MRD_CNS; case IF_RWR_RRD_ARD: return IF_RWR_RRD_MRD; @@ -3908,6 +3912,94 @@ void emitter::emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTre emitCurIGsize += sz; } +void emitter::emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir, int ival) +{ + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); + + ssize_t offs = indir->Offset(); + instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); 
+ + id->idIns(ins); + id->idReg1(reg1); + + emitHandleMemOp(indir, id, IF_RRW_ARD_CNS, ins); + + // Plus one for the 1-byte immediate (ival) + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)) + emitGetVexPrefixAdjustedSize(ins, attr, insCodeRM(ins)) + 1; + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_C_I( + instruction ins, emitAttr attr, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival) +{ + // Static always need relocs + if (!jitStaticFldIsGlobAddr(fldHnd)) + { + attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); + } + + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); + + instrDesc* id = emitNewInstrCnsDsp(attr, ival, offs); + UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins)); + + // Plus one for the 1 byte immediate (ival) + sz += 1; + + // VEX prefix + sz += emitGetVexPrefixAdjustedSize(ins, attr, insCodeRM(ins)); + + // REX prefix + if (IsExtendedReg(reg1, attr)) + { + sz += emitGetRexPrefixSize(ins); + } + + id->idIns(ins); + id->idInsFmt(IF_RRW_MRD_CNS); + id->idReg1(reg1); + id->idAddr()->iiaFieldHnd = fldHnd; + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival) +{ + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); + + instrDesc* id = emitNewInstrCns(attr, ival); + UNATIVE_OFFSET sz = emitInsSizeSV(insCodeRM(ins), varx, offs); + + // Plus one for the 1 byte immediate (ival) + sz += 1; + + // VEX prefix + sz += emitGetVexPrefixAdjustedSize(ins, attr, insCodeRM(ins)); + + // REX prefix + if (IsExtendedReg(reg1, attr)) + { + sz += emitGetRexPrefixSize(ins); + } + + id->idIns(ins); + id->idInsFmt(IF_RRW_SRD_CNS); + id->idReg1(reg1); + id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); + id->idCodeSize(sz); +#ifdef DEBUG + id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; +#endif + + dispIns(id); + emitCurIGsize += sz; +} + void emitter::emitIns_R_R_A( 
instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, insFormat fmt) { @@ -3952,6 +4044,7 @@ void emitter::emitIns_R_R_C( id->idAddr()->iiaFieldHnd = fldHnd; id->idCodeSize(sz); + dispIns(id); emitCurIGsize += sz; } @@ -7059,6 +7152,24 @@ void emitter::emitDispIns( emitDispAddrMode(id); break; + case IF_RRW_ARD_CNS: + printf("%s, %s", emitRegName(id->idReg1(), attr), sstr); + emitDispAddrMode(id); + + emitGetInsAmdCns(id, &cnsVal); + val = cnsVal.cnsVal; + + printf(", "); + if (cnsVal.cnsReloc) + { + emitDispReloc(val); + } + else + { + goto PRINT_CONSTANT; + } + break; + case IF_RWR_RRD_ARD: printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr); emitDispAddrMode(id); @@ -7207,6 +7318,25 @@ void emitter::emitDispIns( break; + case IF_RRW_SRD_CNS: + printf("%s, %s", emitRegName(id->idReg1(), attr), sstr); + emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), + id->idDebugOnlyInfo()->idVarRefOffs, asmfm); + + emitGetInsCns(id, &cnsVal); + val = cnsVal.cnsVal; + + printf(", "); + if (cnsVal.cnsReloc) + { + emitDispReloc(val); + } + else + { + goto PRINT_CONSTANT; + } + break; + case IF_RWR_RRD_SRD: printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr); emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(), @@ -7341,6 +7471,25 @@ void emitter::emitDispIns( emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC); break; + case IF_RRW_MRD_CNS: + printf("%s, %s", emitRegName(id->idReg1(), attr), sstr); + offs = emitGetInsDsp(id); + emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC); + + emitGetInsDcmCns(id, &cnsVal); + val = cnsVal.cnsVal; + + printf(", "); + if (cnsVal.cnsReloc) + { + emitDispReloc(val); + } + else + { + goto PRINT_CONSTANT; + } + break; + case IF_RWR_RRD_MRD: printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), 
sstr); offs = emitGetInsDsp(id); @@ -7793,7 +7942,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Therefore, add VEX prefix is one is not already present. code = AddVexPrefixIfNeededAndNotPresent(ins, code, size); - if (IsDstDstSrcAVXInstruction(ins)) + // For this format, moves do not support a third operand, so we only need to handle the binary ops. + if (IsDstDstSrcAVXInstruction(ins) && !Is4ByteAVXInstruction(ins)) { regNumber src1 = id->idReg2(); @@ -7826,8 +7976,19 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) rgx = (regNumber)RegEncoding(rgx); } + // Special case emitting AVX instructions + if (Is4ByteAVXInstruction(ins)) + { + unsigned regcode = insEncodeReg345(ins, id->idReg1(), size, &code); + dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + + // Emit last opcode byte + assert((code & 0xFF) == 0); + dst += emitOutputByte(dst, (code >> 8) & 0xFF); + code = regcode; + } // Is this a 'big' opcode? - if (code & 0xFF000000) + else if (code & 0xFF000000) { // Output the REX prefix dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); @@ -7949,7 +8110,14 @@ GOT_DSP: // The address is of the form "[disp]" // On x86 - disp is relative to zero // On Amd64 - disp is relative to RIP - dst += emitOutputWord(dst, code | 0x0500); + if (Is4ByteAVXInstruction(ins)) + { + dst += emitOutputByte(dst, code | 0x05); + } + else + { + dst += emitOutputWord(dst, code | 0x0500); + } if (addc) { @@ -7998,7 +8166,14 @@ GOT_DSP: else { #ifdef _TARGET_X86_ - dst += emitOutputWord(dst, code | 0x0500); + if (Is4ByteAVXInstruction(ins)) + { + dst += emitOutputByte(dst, code | 0x05); + } + else + { + dst += emitOutputWord(dst, code | 0x0500); + } #else //_TARGET_AMD64_ // Amd64: addr fits within 32-bits and can be encoded as a displacement relative to zero. 
// This addr mode should never be used while generating relocatable ngen code nor if @@ -8008,7 +8183,14 @@ GOT_DSP: noway_assert((int)dsp == dsp); // This requires, specifying a SIB byte after ModRM byte. - dst += emitOutputWord(dst, code | 0x0400); + if (Is4ByteAVXInstruction(ins)) + { + dst += emitOutputByte(dst, code | 0x04); + } + else + { + dst += emitOutputWord(dst, code | 0x0400); + } dst += emitOutputByte(dst, 0x25); #endif //_TARGET_AMD64_ dst += emitOutputLong(dst, dsp); @@ -8016,20 +8198,42 @@ GOT_DSP: break; case REG_EBP: - // Does the offset fit in a byte? - if (dspInByte) + if (Is4ByteAVXInstruction(ins)) { - dst += emitOutputWord(dst, code | 0x4500); - dst += emitOutputByte(dst, dsp); + // Does the offset fit in a byte? + if (dspInByte) + { + dst += emitOutputByte(dst, code | 0x45); + dst += emitOutputByte(dst, dsp); + } + else + { + dst += emitOutputByte(dst, code | 0x85); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + } + } } else { - dst += emitOutputWord(dst, code | 0x8500); - dst += emitOutputLong(dst, dsp); - - if (id->idIsDspReloc()) + // Does the offset fit in a byte? + if (dspInByte) + { + dst += emitOutputWord(dst, code | 0x4500); + dst += emitOutputByte(dst, dsp); + } + else { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + dst += emitOutputWord(dst, code | 0x8500); + dst += emitOutputLong(dst, dsp); + + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + } } } break; @@ -8048,55 +8252,116 @@ GOT_DSP: (ins == INS_or)); #endif // LEGACY_BACKEND - // Is the offset 0 or does it at least fit in a byte? 
- if (dspIsZero) - { - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, 0x24); - } - else if (dspInByte) + if (Is4ByteAVXInstruction(ins)) { - dst += emitOutputWord(dst, code | 0x4400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputByte(dst, dsp); + // Is the offset 0 or does it at least fit in a byte? + if (dspIsZero) + { + dst += emitOutputByte(dst, code | 0x04); + dst += emitOutputByte(dst, 0x24); + } + else if (dspInByte) + { + dst += emitOutputByte(dst, code | 0x44); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputByte(dst, dsp); + } + else + { + dst += emitOutputByte(dst, code | 0x84); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputLong(dst, dsp); + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + } + } } else { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, 0x24); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) + // Is the offset 0 or does it at least fit in a byte? + if (dspIsZero) { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + dst += emitOutputWord(dst, code | 0x0400); + dst += emitOutputByte(dst, 0x24); + } + else if (dspInByte) + { + dst += emitOutputWord(dst, code | 0x4400); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputByte(dst, dsp); + } + else + { + dst += emitOutputWord(dst, code | 0x8400); + dst += emitOutputByte(dst, 0x24); + dst += emitOutputLong(dst, dsp); + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + } } } break; default: - // Put the register in the opcode - code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) << 8; - - // Is there a displacement? 
- if (dspIsZero) + if (Is4ByteAVXInstruction(ins)) { - // This is simply "[reg]" - dst += emitOutputWord(dst, code); + // Put the register in the opcode + code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr); + + // Is there a displacement? + if (dspIsZero) + { + // This is simply "[reg]" + dst += emitOutputByte(dst, code); + } + else + { + // This is [reg + dsp]" -- does the offset fit in a byte? + if (dspInByte) + { + dst += emitOutputByte(dst, code | 0x40); + dst += emitOutputByte(dst, dsp); + } + else + { + dst += emitOutputByte(dst, code | 0x80); + dst += emitOutputLong(dst, dsp); + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + } + } + } } else { - // This is [reg + dsp]" -- does the offset fit in a byte? - if (dspInByte) + // Put the register in the opcode + code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) << 8; + + // Is there a displacement? + if (dspIsZero) { - dst += emitOutputWord(dst, code | 0x4000); - dst += emitOutputByte(dst, dsp); + // This is simply "[reg]" + dst += emitOutputWord(dst, code); } else { - dst += emitOutputWord(dst, code | 0x8000); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) + // This is [reg + dsp]" -- does the offset fit in a byte? 
+ if (dspInByte) { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + dst += emitOutputWord(dst, code | 0x4000); + dst += emitOutputByte(dst, dsp); + } + else + { + dst += emitOutputWord(dst, code | 0x8000); + dst += emitOutputLong(dst, dsp); + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + } } } } @@ -8121,30 +8386,63 @@ GOT_DSP: regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) | insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); - // Emit [ebp + {2/4/8} * rgz] as [ebp + {2/4/8} * rgx + 0] - if (dspIsZero && reg != REG_EBP) + if (Is4ByteAVXInstruction(ins)) { - // The address is "[reg + {2/4/8} * rgx]" - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, regByte); + // Emit [ebp + {2/4/8} * rgz] as [ebp + {2/4/8} * rgx + 0] + if (dspIsZero && reg != REG_EBP) + { + // The address is "[reg + {2/4/8} * rgx]" + dst += emitOutputByte(dst, code | 0x04); + dst += emitOutputByte(dst, regByte); + } + else + { + // The address is "[reg + {2/4/8} * rgx + disp]" + if (dspInByte) + { + dst += emitOutputByte(dst, code | 0x44); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } + else + { + dst += emitOutputByte(dst, code | 0x84); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + } + } + } } else { - // The address is "[reg + {2/4/8} * rgx + disp]" - if (dspInByte) + // Emit [ebp + {2/4/8} * rgz] as [ebp + {2/4/8} * rgx + 0] + if (dspIsZero && reg != REG_EBP) { - dst += emitOutputWord(dst, code | 0x4400); + // The address is "[reg + {2/4/8} * rgx]" + dst += emitOutputWord(dst, code | 0x0400); dst += emitOutputByte(dst, regByte); - dst += emitOutputByte(dst, dsp); } else { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, 
regByte); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) + // The address is "[reg + {2/4/8} * rgx + disp]" + if (dspInByte) { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + dst += emitOutputWord(dst, code | 0x4400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } + else + { + dst += emitOutputWord(dst, code | 0x8400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + } } } } @@ -8155,7 +8453,15 @@ GOT_DSP: regByte = insEncodeReg012(ins, REG_EBP, EA_PTRSIZE, nullptr) | insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); - dst += emitOutputWord(dst, code | 0x0400); + if (Is4ByteAVXInstruction(ins)) + { + dst += emitOutputByte(dst, code | 0x04); + } + else + { + dst += emitOutputWord(dst, code | 0x0400); + } + dst += emitOutputByte(dst, regByte); // Special case: jump through a jump table @@ -8176,29 +8482,61 @@ GOT_DSP: // The address is "[reg+rgx+dsp]" regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) | insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr); - if (dspIsZero && reg != REG_EBP) + if (Is4ByteAVXInstruction(ins)) { - // This is [reg+rgx]" - dst += emitOutputWord(dst, code | 0x0400); - dst += emitOutputByte(dst, regByte); + if (dspIsZero && reg != REG_EBP) + { + // This is [reg+rgx]" + dst += emitOutputByte(dst, code | 0x04); + dst += emitOutputByte(dst, regByte); + } + else + { + // This is [reg+rgx+dsp]" -- does the offset fit in a byte? 
+ if (dspInByte) + { + dst += emitOutputByte(dst, code | 0x44); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } + else + { + dst += emitOutputByte(dst, code | 0x84); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + } + } + } } else { - // This is [reg+rgx+dsp]" -- does the offset fit in a byte? - if (dspInByte) + if (dspIsZero && reg != REG_EBP) { - dst += emitOutputWord(dst, code | 0x4400); + // This is [reg+rgx]" + dst += emitOutputWord(dst, code | 0x0400); dst += emitOutputByte(dst, regByte); - dst += emitOutputByte(dst, dsp); } else { - dst += emitOutputWord(dst, code | 0x8400); - dst += emitOutputByte(dst, regByte); - dst += emitOutputLong(dst, dsp); - if (id->idIsDspReloc()) + // This is [reg+rgx+dsp]" -- does the offset fit in a byte? + if (dspInByte) { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + dst += emitOutputWord(dst, code | 0x4400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputByte(dst, dsp); + } + else + { + dst += emitOutputWord(dst, code | 0x8400); + dst += emitOutputByte(dst, regByte); + dst += emitOutputLong(dst, dsp); + if (id->idIsDspReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)dsp, IMAGE_REL_BASED_HIGHLOW); + } } } } @@ -11148,6 +11486,25 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } break; + case IF_RRW_ARD_CNS: + emitGetInsAmdCns(id, &cnsVal); + code = insCodeRM(ins); + + // Special case 4-byte AVX instructions + if (Is4ByteAVXInstruction(ins)) + { + dst = emitOutputAM(dst, id, code, &cnsVal); + } + else + { + code = AddVexPrefixIfNeeded(ins, code, size); + regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + dst = emitOutputAM(dst, id, code | regcode, &cnsVal); + } + + sz = emitSizeOfInsDsc(id); + break; + case IF_RRD_ARD: case 
IF_RWR_ARD: case IF_RRW_ARD: @@ -11243,6 +11600,38 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) sz = emitSizeOfInsDsc(id); break; + case IF_RRW_SRD_CNS: + emitGetInsCns(id, &cnsVal); + code = insCodeRM(ins); + + // Special case 4-byte AVX instructions + if (Is4ByteAVXInstruction(ins)) + { + dst = emitOutputSV(dst, id, code, &cnsVal); + } + else + { + code = AddVexPrefixIfNeeded(ins, code, size); + + // In case of AVX instructions that take 3 operands, encode reg1 as first source. + // Note that reg1 is both a source and a destination. + // + // TODO-XArch-CQ: Eventually we need to support 3 operand instruction formats. For + // now we use the single source as source1 and source2. + // For this format, moves do not support a third operand, so we only need to handle the binary ops. + if (IsDstDstSrcAVXInstruction(ins)) + { + // encode source operand reg in 'vvvv' bits in 1's complement form + code = insEncodeReg3456(ins, id->idReg1(), size, code); + } + + regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + dst = emitOutputSV(dst, id, code | regcode, &cnsVal); + } + + sz = emitSizeOfInsDsc(id); + break; + case IF_RRD_SRD: case IF_RWR_SRD: case IF_RRW_SRD: @@ -11328,6 +11717,38 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst = emitOutputCV(dst, id, insCodeMI(ins)); break; + case IF_RRW_MRD_CNS: + emitGetInsDcmCns(id, &cnsVal); + code = insCodeRM(ins); + + // Special case 4-byte AVX instructions + if (Is4ByteAVXInstruction(ins)) + { + dst = emitOutputCV(dst, id, code, &cnsVal); + } + else + { + code = AddVexPrefixIfNeeded(ins, code, size); + + // In case of AVX instructions that take 3 operands, encode reg1 as first source. + // Note that reg1 is both a source and a destination. + // + // TODO-XArch-CQ: Eventually we need to support 3 operand instruction formats. For + // now we use the single source as source1 and source2.
+ // For this format, moves do not support a third operand, so we only need to handle the binary ops. + if (IsDstDstSrcAVXInstruction(ins)) + { + // encode source operand reg in 'vvvv' bits in 1's complement form + code = insEncodeReg3456(ins, id->idReg1(), size, code); + } + + regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + dst = emitOutputCV(dst, id, code | regcode | 0x0500, &cnsVal); + } + + sz = emitSizeOfInsDsc(id); + break; + case IF_RRD_MRD: case IF_RWR_MRD: case IF_RRW_MRD: diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h index c301833857..c66f57cc92 100644 --- a/src/jit/emitxarch.h +++ b/src/jit/emitxarch.h @@ -367,6 +367,12 @@ void emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg void emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir, insFormat fmt); +void emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir, int ival); + +void emitIns_R_C_I(instruction ins, emitAttr attr, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival); + +void emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival); + void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, insFormat fmt); void emitIns_R_R_C( diff --git a/src/jit/importer.cpp b/src/jit/importer.cpp index 6fd836fc28..5c7144d51d 100644 --- a/src/jit/importer.cpp +++ b/src/jit/importer.cpp @@ -19111,7 +19111,8 @@ bool Compiler::IsTargetIntrinsic(CorInfoIntrinsics intrinsicId) #if defined(_TARGET_AMD64_) || (defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)) switch (intrinsicId) { - // Amd64 only has SSE2 instruction to directly compute sqrt/abs. + // AMD64/x86 has SSE2 instructions to directly compute sqrt/abs and SSE4.1 + // instructions to directly compute round/ceiling/floor.
// // TODO: Because the x86 backend only targets SSE for floating-point code, // it does not treat Sine, Cosine, or Round as intrinsics (JIT32 @@ -19123,6 +19124,11 @@ bool Compiler::IsTargetIntrinsic(CorInfoIntrinsics intrinsicId) case CORINFO_INTRINSIC_Abs: return true; + case CORINFO_INTRINSIC_Round: + case CORINFO_INTRINSIC_Ceiling: + case CORINFO_INTRINSIC_Floor: + return compSupports(InstructionSet_SSE41); + default: return false; } diff --git a/src/jit/instr.cpp b/src/jit/instr.cpp index b6d226657a..a3e354222a 100644 --- a/src/jit/instr.cpp +++ b/src/jit/instr.cpp @@ -2751,7 +2751,8 @@ void CodeGen::inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regN { #if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND) assert(ins == INS_shld || ins == INS_shrd || ins == INS_shufps || ins == INS_shufpd || ins == INS_pshufd || - ins == INS_cmpps || ins == INS_cmppd || ins == INS_dppd || ins == INS_dpps || ins == INS_insertps); + ins == INS_cmpps || ins == INS_cmppd || ins == INS_dppd || ins == INS_dpps || ins == INS_insertps || + ins == INS_roundps || ins == INS_roundss || ins == INS_roundpd || ins == INS_roundsd); #else // !_TARGET_XARCH_ assert(ins == INS_shld || ins == INS_shrd); #endif // !_TARGET_XARCH_ diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index 8d6dd6a3e8..2164e62f9c 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -363,7 +363,10 @@ INST3( pmovsxbw, "pmovsxbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( pmovsxwd, "pmovsxwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x23)) // Packed sign extend short to int INST3( pmovsxdq, "pmovsxdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x25)) // Packed sign extend int to long INST3( packusdw, "packusdw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2B)) // Pack (narrow) int to unsigned short with saturation - +INST3( roundps, "roundps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x08)) // Round packed single precision floating-point values +INST3( roundss, 
"roundss" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0A)) // Round scalar single precision floating-point values +INST3( roundpd, "roundpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x09)) // Round packed double precision floating-point values +INST3( roundsd, "roundsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0B)) // Round scalar double precision floating-point values INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index a2990baf63..d1ffb58ca9 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -2167,7 +2167,11 @@ void Lowering::ContainCheckBoundsChk(GenTreeBoundsChk* node) void Lowering::ContainCheckIntrinsic(GenTreeOp* node) { assert(node->OperIs(GT_INTRINSIC)); - if (node->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Sqrt) + + CorInfoIntrinsics intrinsicId = node->gtIntrinsic.gtIntrinsicId; + + if (intrinsicId == CORINFO_INTRINSIC_Sqrt || intrinsicId == CORINFO_INTRINSIC_Round || + intrinsicId == CORINFO_INTRINSIC_Ceiling || intrinsicId == CORINFO_INTRINSIC_Floor) { GenTree* op1 = node->gtGetOp1(); if (IsContainableMemoryOp(op1) || op1->IsCnsNonZeroFltOrDbl()) diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index bc76e20fe9..a80c6bd8bd 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -2069,11 +2069,18 @@ void LinearScan::TreeNodeInfoInitIntrinsic(GenTree* tree, TreeNodeInfo* info) #ifdef _TARGET_X86_ case CORINFO_INTRINSIC_Cos: case CORINFO_INTRINSIC_Sin: - case CORINFO_INTRINSIC_Round: - NYI_X86("Math intrinsics Cos, Sin and Round"); + NYI_X86("Math intrinsics Cos and Sin"); break; #endif // _TARGET_X86_ + case CORINFO_INTRINSIC_Round: + case CORINFO_INTRINSIC_Ceiling: + case CORINFO_INTRINSIC_Floor: +#if defined(LEGACY_BACKEND) + NYI_X86("Math intrinsics Round, Ceiling, and Floor"); +#endif 
// LEGACY_BACKEND + break; + default: // Right now only Sqrt/Abs are treated as math intrinsics noway_assert(!"Unsupported math intrinsic"); diff --git a/src/jit/rationalize.cpp b/src/jit/rationalize.cpp index a5ae268207..3696f40f7c 100644 --- a/src/jit/rationalize.cpp +++ b/src/jit/rationalize.cpp @@ -823,7 +823,7 @@ Compiler::fgWalkResult Rationalizer::RewriteNode(GenTree** useEdge, ArrayStack<G - assert(Compiler::IsTargetIntrinsic(node->gtIntrinsic.gtIntrinsicId)); + assert(comp->IsTargetIntrinsic(node->gtIntrinsic.gtIntrinsicId)); break; #ifdef FEATURE_SIMD @@ -999,7 +999,7 @@ void Rationalizer::DoPhase() { GenTree* const node = *use; if (node->OperGet() == GT_INTRINSIC && - Compiler::IsIntrinsicImplementedByUserCall(node->gtIntrinsic.gtIntrinsicId)) + m_rationalizer.comp->IsIntrinsicImplementedByUserCall(node->gtIntrinsic.gtIntrinsicId)) { m_rationalizer.RewriteIntrinsicAsUserCall(use, this->m_ancestors); } diff --git a/src/jit/stackfp.cpp b/src/jit/stackfp.cpp index 3aef2381c4..f35f34e9cf 100644 --- a/src/jit/stackfp.cpp +++ b/src/jit/stackfp.cpp @@ -2258,7 +2258,7 @@ void CodeGen::genCodeForTreeStackFP_SmpOp(GenTreePtr tree) } case GT_INTRINSIC: { - assert(Compiler::IsMathIntrinsic(tree)); + assert(compiler->IsMathIntrinsic(tree)); GenTreePtr op1 = tree->gtOp.gtOp1; diff --git a/src/jit/valuenum.cpp b/src/jit/valuenum.cpp index 1d50271821..7fed0e0392 100644 --- a/src/jit/valuenum.cpp +++ b/src/jit/valuenum.cpp @@ -3491,7 +3491,7 @@ ValueNum ValueNumStore::EvalMathFuncUnary(var_types typ, CorInfoIntrinsics gtMat // If the math intrinsic is not implemented by target-specific instructions, such as implemented // by user calls, then don't do constant folding on it. This minimizes precision loss. - if (IsVNConstant(arg0VN) && Compiler::IsTargetIntrinsic(gtMathFN)) + if (IsVNConstant(arg0VN) && m_pComp->IsTargetIntrinsic(gtMathFN)) { assert(varTypeIsFloating(TypeOfVN(arg0VN))); -- cgit v1.2.3