Diffstat (limited to 'src/jit/lower.cpp')
-rw-r--r-- | src/jit/lower.cpp | 742
1 file changed, 658 insertions(+), 84 deletions(-)
diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index 09eb9146ac..a6e50b304c 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -135,6 +135,15 @@ GenTree* Lowering::LowerNode(GenTree* node) LowerCall(node); break; + case GT_LT: + case GT_LE: + case GT_GT: + case GT_GE: + case GT_EQ: + case GT_NE: + LowerCompare(node); + break; + case GT_JMP: LowerJmpMethod(node); break; @@ -169,13 +178,33 @@ GenTree* Lowering::LowerNode(GenTree* node) // produces a TYP_SIMD16 result node->gtType = TYP_SIMD16; } + +#ifdef _TARGET_XARCH_ + if ((node->AsSIMD()->gtSIMDIntrinsicID == SIMDIntrinsicGetItem) && (node->gtGetOp1()->OperGet() == GT_IND)) + { + // If SIMD vector is already in memory, we force its + // addr to be evaluated into a reg. This would allow + // us to generate [regBase] or [regBase+offset] or + // [regBase+sizeOf(SIMD vector baseType)*regIndex] + // to access the required SIMD vector element directly + // from memory. + // + // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we + // might be able update GT_LEA to fold the regIndex + // or offset in some cases. Instead with this + // approach we always evaluate GT_LEA into a reg. + // Ideally, we should be able to lower GetItem intrinsic + // into GT_IND(newAddr) where newAddr combines + // the addr of SIMD vector with the given index. + node->gtOp.gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; + } +#endif break; case GT_LCL_VAR: case GT_STORE_LCL_VAR: if (node->TypeGet() == TYP_SIMD12) { -#ifdef _TARGET_64BIT_ // Assumption 1: // RyuJit backend depends on the assumption that on 64-Bit targets Vector3 size is rounded off // to TARGET_POINTER_SIZE and hence Vector3 locals on stack can be treated as TYP_SIMD16 for @@ -198,10 +227,29 @@ GenTree* Lowering::LowerNode(GenTree* node) // Vector3 return values are returned two return registers and Caller assembles them into a // single xmm reg. Hence RyuJIT explicitly generates code to clears upper 4-bytes of Vector3 // type args in prolog and Vector3 type return value of a call + // + // RyuJIT x86 Windows: all non-param Vector3 local vars are allocated as 16 bytes. Vector3 arguments + // are pushed as 12 bytes. For return values, a 16-byte local is allocated and the address passed + // as a return buffer pointer. The callee doesn't write the high 4 bytes, and we don't need to clear + // it either. + + unsigned varNum = node->AsLclVarCommon()->GetLclNum(); + LclVarDsc* varDsc = &comp->lvaTable[varNum]; + +#if defined(_TARGET_64BIT_) + assert(varDsc->lvSize() == 16); node->gtType = TYP_SIMD16; -#else - NYI("Lowering of TYP_SIMD12 locals"); -#endif // _TARGET_64BIT_ +#else // !_TARGET_64BIT_ + if (varDsc->lvSize() == 16) + { + node->gtType = TYP_SIMD16; + } + else + { + // The following assert is guaranteed by lvSize(). + assert(varDsc->lvIsParam); + } +#endif // !_TARGET_64BIT_ } #endif // FEATURE_SIMD __fallthrough; @@ -215,7 +263,7 @@ GenTree* Lowering::LowerNode(GenTree* node) #if FEATURE_MULTIREG_RET GenTree* src = node->gtGetOp1(); assert((src->OperGet() == GT_CALL) && src->AsCall()->HasMultiRegRetVal()); -#else // !FEATURE_MULTIREG_RET +#else // !FEATURE_MULTIREG_RET assert(!"Unexpected struct local store in Lowering"); #endif // !FEATURE_MULTIREG_RET } @@ -680,7 +728,7 @@ void Lowering::ReplaceArgWithPutArgOrCopy(GenTree** argSlot, GenTree* putArgOrCo // Arguments: // call - the call whose arg is being rewritten. // arg - the arg being rewritten. -// info - the ArgTabEntry information for the argument. +// info - the fgArgTabEntry information for the argument. 
// type - the type of the argument. // // Return Value: @@ -692,11 +740,11 @@ void Lowering::ReplaceArgWithPutArgOrCopy(GenTree** argSlot, GenTree* putArgOrCo // // Notes: // For System V systems with native struct passing (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) -// this method allocates a single GT_PUTARG_REG for 1 eightbyte structs and a GT_LIST of two GT_PUTARG_REGs +// this method allocates a single GT_PUTARG_REG for 1 eightbyte structs and a GT_FIELD_LIST of two GT_PUTARG_REGs // for two eightbyte structs. // // For STK passed structs the method generates GT_PUTARG_STK tree. For System V systems with native struct passing -// (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) this method also sets the GP pointers count and the pointers +// (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) this method also sets the GC pointers count and the pointers // layout object, so the codegen of the GT_PUTARG_STK could use this for optimizing copying to the stack by value. // (using block copy primitives for non GC pointers and a single TARGET_POINTER_SIZE copy with recording GC info.) // @@ -753,8 +801,8 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP // In this case a new tree is created that is GT_PUTARG_REG // with a op1 the original argument. // 2. The struct is contained in 2 eightbytes: - // in this case the arg comes as a GT_LIST of two GT_LCL_FLDs - the two eightbytes of the struct. - // The code creates a GT_PUTARG_REG node for each GT_LCL_FLD in the GT_LIST + // in this case the arg comes as a GT_FIELD_LIST of two GT_LCL_FLDs - the two eightbytes of the struct. + // The code creates a GT_PUTARG_REG node for each GT_LCL_FLD in the GT_FIELD_LIST // and splices it in the list with the corresponding original GT_LCL_FLD tree as op1. assert(info->structDesc.eightByteCount != 0); @@ -826,25 +874,25 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP // // clang-format on - assert(arg->OperGet() == GT_LIST); + assert(arg->OperGet() == GT_FIELD_LIST); - GenTreeArgList* argListPtr = arg->AsArgList(); - assert(argListPtr->IsAggregate()); + GenTreeFieldList* fieldListPtr = arg->AsFieldList(); + assert(fieldListPtr->IsFieldListHead()); - for (unsigned ctr = 0; argListPtr != nullptr; argListPtr = argListPtr->Rest(), ctr++) + for (unsigned ctr = 0; fieldListPtr != nullptr; fieldListPtr = fieldListPtr->Rest(), ctr++) { // Create a new GT_PUTARG_REG node with op1 the original GT_LCL_FLD. GenTreePtr newOper = comp->gtNewOperNode( GT_PUTARG_REG, comp->GetTypeFromClassificationAndSizes(info->structDesc.eightByteClassifications[ctr], info->structDesc.eightByteSizes[ctr]), - argListPtr->gtOp.gtOp1); + fieldListPtr->gtOp.gtOp1); - // Splice in the new GT_PUTARG_REG node in the GT_LIST - ReplaceArgWithPutArgOrCopy(&argListPtr->gtOp.gtOp1, newOper); + // Splice in the new GT_PUTARG_REG node in the GT_FIELD_LIST + ReplaceArgWithPutArgOrCopy(&fieldListPtr->gtOp.gtOp1, newOper); } - // Just return arg. The GT_LIST is not replaced. + // Just return arg. The GT_FIELD_LIST is not replaced. // Nothing more to do. 
return arg; } @@ -857,26 +905,26 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP else #else // not defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) #if FEATURE_MULTIREG_ARGS - if ((info->numRegs > 1) && (arg->OperGet() == GT_LIST)) + if ((info->numRegs > 1) && (arg->OperGet() == GT_FIELD_LIST)) { - assert(arg->OperGet() == GT_LIST); + assert(arg->OperGet() == GT_FIELD_LIST); - GenTreeArgList* argListPtr = arg->AsArgList(); - assert(argListPtr->IsAggregate()); + GenTreeFieldList* fieldListPtr = arg->AsFieldList(); + assert(fieldListPtr->IsFieldListHead()); - for (unsigned ctr = 0; argListPtr != nullptr; argListPtr = argListPtr->Rest(), ctr++) + for (unsigned ctr = 0; fieldListPtr != nullptr; fieldListPtr = fieldListPtr->Rest(), ctr++) { - GenTreePtr curOp = argListPtr->gtOp.gtOp1; + GenTreePtr curOp = fieldListPtr->gtOp.gtOp1; var_types curTyp = curOp->TypeGet(); // Create a new GT_PUTARG_REG node with op1 GenTreePtr newOper = comp->gtNewOperNode(GT_PUTARG_REG, curTyp, curOp); - // Splice in the new GT_PUTARG_REG node in the GT_LIST - ReplaceArgWithPutArgOrCopy(&argListPtr->gtOp.gtOp1, newOper); + // Splice in the new GT_PUTARG_REG node in the GT_FIELD_LIST + ReplaceArgWithPutArgOrCopy(&fieldListPtr->gtOp.gtOp1, newOper); } - // Just return arg. The GT_LIST is not replaced. + // Just return arg. The GT_FIELD_LIST is not replaced. // Nothing more to do. return arg; } @@ -893,23 +941,20 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP // This provides the info to put this argument in in-coming arg area slot // instead of in out-going arg area slot. - FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(assert(info->isStruct == varTypeIsStruct(type))); // Make sure state is - // correct + PUT_STRUCT_ARG_STK_ONLY(assert(info->isStruct == varTypeIsStruct(type))); // Make sure state is + // correct #if FEATURE_FASTTAILCALL putArg = new (comp, GT_PUTARG_STK) - GenTreePutArgStk(GT_PUTARG_STK, type, arg, - info->slotNum FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(info->numSlots) - FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(info->isStruct), + GenTreePutArgStk(GT_PUTARG_STK, type, arg, info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots), call->IsFastTailCall() DEBUGARG(call)); #else putArg = new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, type, arg, - info->slotNum FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(info->numSlots) - FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(info->isStruct) DEBUGARG(call)); + info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots) DEBUGARG(call)); #endif -#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +#ifdef FEATURE_PUT_STRUCT_ARG_STK // If the ArgTabEntry indicates that this arg is a struct // get and store the number of slots that are references. // This is later used in the codegen for PUT_ARG_STK implementation @@ -919,8 +964,6 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP // pair copying using XMM registers or rep mov instructions. if (info->isStruct) { - unsigned numRefs = 0; - BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots]; // We use GT_OBJ for non-SIMD struct arguments. However, for // SIMD arguments the GT_OBJ has already been transformed. 
if (arg->gtOper != GT_OBJ) @@ -929,13 +972,14 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP } else { + unsigned numRefs = 0; + BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots]; assert(!varTypeIsSIMD(arg)); numRefs = comp->info.compCompHnd->getClassGClayout(arg->gtObj.gtClass, gcLayout); + putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout); } - - putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout); } -#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING +#endif // FEATURE_PUT_STRUCT_ARG_STK } if (arg->InReg()) @@ -1011,6 +1055,22 @@ void Lowering::LowerArg(GenTreeCall* call, GenTreePtr* ppArg) type = TYP_INT; } +#if defined(FEATURE_SIMD) && defined(_TARGET_X86_) + // Non-param TYP_SIMD12 local var nodes are massaged in Lower to TYP_SIMD16 to match their + // allocated size (see lvSize()). However, when passing the variables as arguments, and + // storing the variables to the outgoing argument area on the stack, we must use their + // actual TYP_SIMD12 type, so exactly 12 bytes is allocated and written. + if (type == TYP_SIMD16) + { + if ((arg->OperGet() == GT_LCL_VAR) || (arg->OperGet() == GT_STORE_LCL_VAR)) + { + unsigned varNum = arg->AsLclVarCommon()->GetLclNum(); + LclVarDsc* varDsc = &comp->lvaTable[varNum]; + type = varDsc->lvType; + } + } +#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_) + GenTreePtr putArg; // If we hit this we are probably double-lowering. @@ -1068,7 +1128,7 @@ void Lowering::LowerArg(GenTreeCall* call, GenTreePtr* ppArg) putArg = NewPutArg(call, arg, info, type); // In the case of register passable struct (in one or two registers) - // the NewPutArg returns a new node (GT_PUTARG_REG or a GT_LIST with two GT_PUTARG_REGs.) + // the NewPutArg returns a new node (GT_PUTARG_REG or a GT_FIELD_LIST with two GT_PUTARG_REGs.) // If an extra node is returned, splice it in the right place in the tree. if (arg != putArg) { @@ -1367,6 +1427,7 @@ void Lowering::CheckVSQuirkStackPaddingNeeded(GenTreeCall* call) // Inserts profiler hook, GT_PROF_HOOK for a tail call node. // +// AMD64: // We need to insert this after all nested calls, but before all the arguments to this call have been set up. // To do this, we look for the first GT_PUTARG_STK or GT_PUTARG_REG, and insert the hook immediately before // that. If there are no args, then it should be inserted before the call node. @@ -1391,16 +1452,30 @@ void Lowering::CheckVSQuirkStackPaddingNeeded(GenTreeCall* call) // In this case, the GT_PUTARG_REG src is a nested call. We need to put the instructions after that call // (as shown). We assume that of all the GT_PUTARG_*, only the first one can have a nested call. // +// X86: +// Insert the profiler hook immediately before the call. The profiler hook will preserve +// all argument registers (ECX, EDX), but nothing else. +// // Params: // callNode - tail call node -// insertionPoint - if caller has an insertion point; If null -// profiler hook is inserted before args are setup +// insertionPoint - if non-null, insert the profiler hook before this point. +// If null, insert the profiler hook before args are setup // but after all arg side effects are computed. 
+// void Lowering::InsertProfTailCallHook(GenTreeCall* call, GenTree* insertionPoint) { assert(call->IsTailCall()); assert(comp->compIsProfilerHookNeeded()); +#if defined(_TARGET_X86_) + + if (insertionPoint == nullptr) + { + insertionPoint = call; + } + +#else // !defined(_TARGET_X86_) + if (insertionPoint == nullptr) { GenTreePtr tmp = nullptr; @@ -1437,6 +1512,8 @@ void Lowering::InsertProfTailCallHook(GenTreeCall* call, GenTree* insertionPoint } } +#endif // !defined(_TARGET_X86_) + assert(insertionPoint != nullptr); GenTreePtr profHookNode = new (comp, GT_PROF_HOOK) GenTree(GT_PROF_HOOK, TYP_VOID); BlockRange().InsertBefore(insertionPoint, profHookNode); @@ -1705,7 +1782,10 @@ GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget assert(!comp->opts.compNeedSecurityCheck); // tail call from methods that need security check assert(!call->IsUnmanaged()); // tail calls to unamanaged methods assert(!comp->compLocallocUsed); // tail call from methods that also do localloc - assert(!comp->getNeedsGSSecurityCookie()); // jit64 compat: tail calls from methods that need GS check + +#ifdef _TARGET_AMD64_ + assert(!comp->getNeedsGSSecurityCookie()); // jit64 compat: tail calls from methods that need GS check +#endif // _TARGET_AMD64_ // We expect to see a call that meets the following conditions assert(call->IsTailCallViaHelper()); @@ -1713,8 +1793,9 @@ GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget // The TailCall helper call never returns to the caller and is not GC interruptible. // Therefore the block containing the tail call should be a GC safe point to avoid - // GC starvation. - assert(comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT); + // GC starvation. It is legal for the block to be unmarked iff the entry block is a + // GC safe point, as the entry block trivially dominates every reachable block. + assert((comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT) || (comp->fgFirstBB->bbFlags & BBF_GC_SAFE_POINT)); // If PInvokes are in-lined, we have to remember to execute PInvoke method epilog anywhere that // a method returns. This is a case of caller method has both PInvokes and tail calls. @@ -1839,16 +1920,268 @@ GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget // Now add back tail call flags for identifying this node as tail call dispatched via helper. call->gtCallMoreFlags |= GTF_CALL_M_TAILCALL | GTF_CALL_M_TAILCALL_VIA_HELPER; +#ifdef PROFILING_SUPPORTED // Insert profiler tail call hook if needed. // Since we don't know the insertion point, pass null for second param. if (comp->compIsProfilerHookNeeded()) { InsertProfTailCallHook(call, nullptr); } +#endif // PROFILING_SUPPORTED + + assert(call->IsTailCallViaHelper()); return result; } +//------------------------------------------------------------------------ +// Lowering::LowerCompare: lowers a compare node. +// +// For 64-bit targets, this doesn't do much of anything: all comparisons +// that we support can be handled in code generation on such targets. +// +// For 32-bit targets, however, any comparison that feeds a `GT_JTRUE` +// node must be lowered such that the liveness of the operands to the +// comparison is properly visible to the rest of the backend. 
As such, +// a 64-bit comparison is lowered from something like this: +// +// ------------ BB02 [004..014) -> BB02 (cond), preds={BB02,BB01} succs={BB03,BB02} +// N001 ( 1, 1) [000006] ------------ t6 = lclVar int V02 loc0 u:5 $148 +// +// /--* t6 int +// N002 ( 2, 3) [000007] ---------U-- t7 = * cast long <- ulong <- uint $3c0 +// +// N003 ( 3, 10) [000009] ------------ t9 = lconst long 0x0000000000000003 $101 +// +// /--* t7 long +// +--* t9 long +// N004 ( 9, 17) [000010] N------N-U-- t10 = * < int $149 +// +// /--* t10 int +// N005 ( 11, 19) [000011] ------------ * jmpTrue void +// +// To something like this: +// +// ------------ BB02 [004..014) -> BB03 (cond), preds={BB06,BB07,BB01} succs={BB06,BB03} +// [000099] ------------ t99 = const int 0 +// +// [000101] ------------ t101 = const int 0 +// +// /--* t99 int +// +--* t101 int +// N004 ( 9, 17) [000010] N------N-U-- t10 = * > int $149 +// +// /--* t10 int +// N005 ( 11, 19) [000011] ------------ * jmpTrue void +// +// +// ------------ BB06 [???..???) -> BB02 (cond), preds={BB02} succs={BB07,BB02} +// [000105] -------N-U-- jcc void cond=< +// +// +// ------------ BB07 [???..???) -> BB02 (cond), preds={BB06} succs={BB03,BB02} +// N001 ( 1, 1) [000006] ------------ t6 = lclVar int V02 loc0 u:5 $148 +// +// N003 ( 3, 10) [000009] ------------ t9 = const int 3 +// +// /--* t6 int +// +--* t9 int +// [000106] N------N-U-- t106 = * < int +// +// /--* t106 int +// [000107] ------------ * jmpTrue void +// +// Which will eventually generate code similar to the following: +// +// 33DB xor ebx, ebx +// 85DB test ebx, ebx +// 7707 ja SHORT G_M50523_IG04 +// 72E7 jb SHORT G_M50523_IG03 +// 83F803 cmp eax, 3 +// 72E2 jb SHORT G_M50523_IG03 +// +void Lowering::LowerCompare(GenTree* cmp) +{ +#ifndef _TARGET_64BIT_ + if (cmp->gtGetOp1()->TypeGet() != TYP_LONG) + { + return; + } + + LIR::Use cmpUse; + + if (!BlockRange().TryGetUse(cmp, &cmpUse) || cmpUse.User()->OperGet() != GT_JTRUE) + { + return; + } + + GenTree* src1 = cmp->gtGetOp1(); + GenTree* src2 = cmp->gtGetOp2(); + unsigned weight = m_block->getBBWeight(comp); + + LIR::Use loSrc1(BlockRange(), &(src1->gtOp.gtOp1), src1); + LIR::Use loSrc2(BlockRange(), &(src2->gtOp.gtOp1), src2); + + if (loSrc1.Def()->OperGet() != GT_CNS_INT && loSrc1.Def()->OperGet() != GT_LCL_VAR) + { + loSrc1.ReplaceWithLclVar(comp, weight); + } + + if (loSrc2.Def()->OperGet() != GT_CNS_INT && loSrc2.Def()->OperGet() != GT_LCL_VAR) + { + loSrc2.ReplaceWithLclVar(comp, weight); + } + + BasicBlock* jumpDest = m_block->bbJumpDest; + BasicBlock* nextDest = m_block->bbNext; + BasicBlock* newBlock = comp->fgSplitBlockAtEnd(m_block); + + cmp->gtType = TYP_INT; + cmp->gtOp.gtOp1 = src1->gtOp.gtOp2; + cmp->gtOp.gtOp2 = src2->gtOp.gtOp2; + + if (cmp->OperGet() == GT_EQ || cmp->OperGet() == GT_NE) + { + // 64-bit equality comparisons (no matter the polarity) require two 32-bit comparisons: one for the upper 32 + // bits and one for the lower 32 bits. 
As such, we update the flow graph like so: + // + // Before: + // BB0: cond + // / \ + // false true + // | | + // BB1 BB2 + // + // After: + // BB0: cond(hi) + // / \ + // false true + // | | + // | BB3: cond(lo) + // | / \ + // | false true + // \ / | + // BB1 BB2 + // + + BlockRange().Remove(loSrc1.Def()); + BlockRange().Remove(loSrc2.Def()); + GenTree* loCmp = comp->gtNewOperNode(cmp->OperGet(), TYP_INT, loSrc1.Def(), loSrc2.Def()); + loCmp->gtFlags = cmp->gtFlags; + GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp); + LIR::AsRange(newBlock).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue); + + m_block->bbJumpKind = BBJ_COND; + + if (cmp->OperGet() == GT_EQ) + { + cmp->gtOper = GT_NE; + m_block->bbJumpDest = nextDest; + nextDest->bbFlags |= BBF_JMP_TARGET; + comp->fgAddRefPred(nextDest, m_block); + } + else + { + m_block->bbJumpDest = jumpDest; + comp->fgAddRefPred(jumpDest, m_block); + } + + assert(newBlock->bbJumpKind == BBJ_COND); + assert(newBlock->bbJumpDest == jumpDest); + } + else + { + // 64-bit ordinal comparisons are more complicated: they require two comparisons for the upper 32 bits and one + // comparison for the lower 32 bits. We update the flowgraph as such: + // + // Before: + // BB0: cond + // / \ + // false true + // | | + // BB1 BB2 + // + // After: + // BB0: (!cond(hi) && !eq(hi)) + // / \ + // true false + // | | + // | BB3: (cond(hi) && !eq(hi)) + // | / \ + // | false true + // | | | + // | BB4: cond(lo) | + // | / \ | + // | false true | + // \ / \ / + // BB1 BB2 + // + // + // Note that the actual comparisons used to implement "(!cond(hi) && !eq(hi))" and "(cond(hi) && !eq(hi))" + // differ based on the original condition, and all consist of a single node. The switch statement below + // performs the necessary mapping. + // + + genTreeOps hiCmpOper; + genTreeOps loCmpOper; + + switch (cmp->OperGet()) + { + case GT_LT: + cmp->gtOper = GT_GT; + hiCmpOper = GT_LT; + loCmpOper = GT_LT; + break; + case GT_LE: + cmp->gtOper = GT_GT; + hiCmpOper = GT_LT; + loCmpOper = GT_LE; + break; + case GT_GT: + cmp->gtOper = GT_LT; + hiCmpOper = GT_GT; + loCmpOper = GT_GT; + break; + case GT_GE: + cmp->gtOper = GT_LT; + hiCmpOper = GT_GT; + loCmpOper = GT_GE; + break; + default: + unreached(); + } + + BasicBlock* newBlock2 = comp->fgSplitBlockAtEnd(newBlock); + + GenTree* hiJcc = new (comp, GT_JCC) GenTreeJumpCC(hiCmpOper); + hiJcc->gtFlags = cmp->gtFlags; + LIR::AsRange(newBlock).InsertAfter(nullptr, hiJcc); + + BlockRange().Remove(loSrc1.Def()); + BlockRange().Remove(loSrc2.Def()); + GenTree* loCmp = comp->gtNewOperNode(loCmpOper, TYP_INT, loSrc1.Def(), loSrc2.Def()); + loCmp->gtFlags = cmp->gtFlags | GTF_UNSIGNED; + GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp); + LIR::AsRange(newBlock2).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue); + + m_block->bbJumpKind = BBJ_COND; + m_block->bbJumpDest = nextDest; + nextDest->bbFlags |= BBF_JMP_TARGET; + comp->fgAddRefPred(nextDest, m_block); + + newBlock->bbJumpKind = BBJ_COND; + newBlock->bbJumpDest = jumpDest; + comp->fgAddRefPred(jumpDest, newBlock); + + assert(newBlock2->bbJumpKind == BBJ_COND); + assert(newBlock2->bbJumpDest == jumpDest); + } + + BlockRange().Remove(src1); + BlockRange().Remove(src2); +#endif +} + // Lower "jmp <method>" tail call to insert PInvoke method epilog if required. 
void Lowering::LowerJmpMethod(GenTree* jmp) { @@ -2334,8 +2667,12 @@ void Lowering::InsertPInvokeMethodProlog() DISPTREERANGE(firstBlockRange, storeFP); // -------------------------------------------------------- + // On 32-bit targets, CORINFO_HELP_INIT_PINVOKE_FRAME initializes the PInvoke frame and then pushes it onto + // the current thread's Frame stack. On 64-bit targets, it only initializes the PInvoke frame. + CLANG_FORMAT_COMMENT_ANCHOR; - if (comp->opts.eeFlags & CORJIT_FLG_IL_STUB) +#ifdef _TARGET_64BIT_ + if (comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB)) { // Push a frame - if we are NOT in an IL stub, this is done right before the call // The init routine sets InlinedCallFrame's m_pNext, so we just set the thead's top-of-stack @@ -2343,6 +2680,7 @@ void Lowering::InsertPInvokeMethodProlog() firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, frameUpd)); DISPTREERANGE(firstBlockRange, frameUpd); } +#endif // _TARGET_64BIT_ } //------------------------------------------------------------------------ @@ -2405,9 +2743,14 @@ void Lowering::InsertPInvokeMethodEpilog(BasicBlock* returnBB DEBUGARG(GenTreePt GenTree* storeGCState = SetGCState(1); returnBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, storeGCState)); - if (comp->opts.eeFlags & CORJIT_FLG_IL_STUB) + // Pop the frame if necessary. This always happens in the epilog on 32-bit targets. For 64-bit targets, we only do + // this in the epilog for IL stubs; for non-IL stubs the frame is popped after every PInvoke call. + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef _TARGET_64BIT_ + if (comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB)) +#endif // _TARGET_64BIT_ { - // Pop the frame, in non-stubs we do this around each PInvoke call GenTree* frameUpd = CreateFrameLinkUpdate(PopFrame); returnBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, frameUpd)); } @@ -2454,6 +2797,7 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call) comp->fgMorphTree(helperCall); BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, helperCall)); + LowerNode(helperCall); // helper call is inserted before current node and should be lowered here. return; } #endif @@ -2464,7 +2808,7 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call) // InlinedCallFrame.m_pCallSiteSP = SP // x86 only // InlinedCallFrame.m_pCallerReturnAddress = return address // Thread.gcState = 0 - // (non-stub) - update top Frame on TCB + // (non-stub) - update top Frame on TCB // 64-bit targets only // ---------------------------------------------------------------------------------- // Setup InlinedCallFrame.callSiteTarget (which is how the JIT refers to it). @@ -2474,11 +2818,19 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call) if (callType == CT_INDIRECT) { +#if !defined(_TARGET_64BIT_) + // On 32-bit targets, indirect calls need the size of the stack args in InlinedCallFrame.m_Datum. + const unsigned numStkArgBytes = call->fgArgInfo->GetNextSlotNum() * TARGET_POINTER_SIZE; + + src = comp->gtNewIconNode(numStkArgBytes, TYP_INT); +#else + // On 64-bit targets, indirect calls may need the stub parameter value in InlinedCallFrame.m_Datum. + // If the stub parameter value is not needed, m_Datum will be initialized by the VM. 
if (comp->info.compPublishStubParam) { - src = new (comp, GT_LCL_VAR) GenTreeLclVar(TYP_I_IMPL, comp->lvaStubArgumentVar, BAD_IL_OFFSET); + src = comp->gtNewLclvNode(comp->lvaStubArgumentVar, TYP_I_IMPL); } - // else { If we don't have secret parameter, m_Datum will be initialized by VM code } +#endif // !defined(_TARGET_64BIT_) } else { @@ -2542,7 +2894,12 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call) BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, storeLab)); - if (!(comp->opts.eeFlags & CORJIT_FLG_IL_STUB)) + // Push the PInvoke frame if necessary. On 32-bit targets this only happens in the method prolog if a method + // contains PInvokes; on 64-bit targets this is necessary in non-stubs. + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef _TARGET_64BIT_ + if (!comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB)) { // Set the TCB's frame to be the one we just created. // Note the init routine for the InlinedCallFrame (CORINFO_HELP_INIT_PINVOKE_FRAME) @@ -2552,6 +2909,7 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call) GenTree* frameUpd = CreateFrameLinkUpdate(PushFrame); BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, frameUpd)); } +#endif // _TARGET_64BIT_ // IMPORTANT **** This instruction must come last!!! **** // It changes the thread's state to Preemptive mode @@ -2583,7 +2941,7 @@ void Lowering::InsertPInvokeCallEpilog(GenTreeCall* call) // First argument is the address of the frame variable. GenTree* frameAddr = new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, BAD_IL_OFFSET); - frameAddr->gtOper = GT_LCL_VAR_ADDR; + frameAddr->SetOperRaw(GT_LCL_VAR_ADDR); // Insert call to CORINFO_HELP_JIT_PINVOKE_END GenTree* helperCall = @@ -2604,12 +2962,32 @@ void Lowering::InsertPInvokeCallEpilog(GenTreeCall* call) tree = CreateReturnTrapSeq(); BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree)); - // Pop the frame if necessasry - if (!(comp->opts.eeFlags & CORJIT_FLG_IL_STUB)) + // Pop the frame if necessary. On 32-bit targets this only happens in the method epilog; on 64-bit targets thi + // happens after every PInvoke call in non-stubs. 32-bit targets instead mark the frame as inactive. + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef _TARGET_64BIT_ + if (!comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB)) { tree = CreateFrameLinkUpdate(PopFrame); BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree)); } +#else + const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = comp->eeGetEEInfo()->inlinedCallFrameInfo; + + // ---------------------------------------------------------------------------------- + // InlinedCallFrame.m_pCallerReturnAddress = nullptr + + GenTreeLclFld* const storeCallSiteTracker = + new (comp, GT_STORE_LCL_FLD) GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar, + callFrameInfo.offsetOfReturnAddress); + + GenTreeIntCon* const constantZero = new (comp, GT_CNS_INT) GenTreeIntCon(TYP_I_IMPL, 0); + + storeCallSiteTracker->gtOp1 = constantZero; + + BlockRange().InsertBefore(insertionPoint, constantZero, storeCallSiteTracker); +#endif // _TARGET_64BIT_ } //------------------------------------------------------------------------ @@ -2624,7 +3002,7 @@ void Lowering::InsertPInvokeCallEpilog(GenTreeCall* call) GenTree* Lowering::LowerNonvirtPinvokeCall(GenTreeCall* call) { // PInvoke lowering varies depending on the flags passed in by the EE. 
By default, - // GC transitions are generated inline; if CORJIT_FLG2_USE_PINVOKE_HELPERS is specified, + // GC transitions are generated inline; if CORJIT_FLAG_USE_PINVOKE_HELPERS is specified, // GC transitions are instead performed using helper calls. Examples of each case are given // below. Note that the data structure that is used to store information about a call frame // containing any P/Invoke calls is initialized in the method prolog (see @@ -2697,7 +3075,7 @@ GenTree* Lowering::LowerNonvirtPinvokeCall(GenTreeCall* call) #if COR_JIT_EE_VERSION > 460 comp->info.compCompHnd->getAddressOfPInvokeTarget(methHnd, &lookup); #else - void* pIndirection; + void* pIndirection; lookup.accessType = IAT_PVALUE; lookup.addr = comp->info.compCompHnd->getAddressOfPInvokeFixup(methHnd, &pIndirection); if (lookup.addr == nullptr) @@ -2866,14 +3244,10 @@ GenTree* Lowering::LowerVirtualStubCall(GenTreeCall* call) } #endif - // TODO-Cleanup: Disable emitting random NOPs - // This is code to set up an indirect call to a stub address computed // via dictionary lookup. if (call->gtCallType == CT_INDIRECT) { - NYI_X86("Virtual Stub dispatched call lowering via dictionary lookup"); - // The importer decided we needed a stub call via a computed // stub dispatch address, i.e. an address which came from a dictionary lookup. // - The dictionary lookup produces an indirected address, suitable for call @@ -2886,6 +3260,8 @@ GenTree* Lowering::LowerVirtualStubCall(GenTreeCall* call) // All we have to do here is add an indirection to generate the actual call target. GenTree* ind = Ind(call->gtCallAddr); + ind->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; + BlockRange().InsertAfter(call->gtCallAddr, ind); call->gtCallAddr = ind; } @@ -2923,8 +3299,10 @@ GenTree* Lowering::LowerVirtualStubCall(GenTreeCall* call) // So we don't use a register. #ifndef _TARGET_X86_ // on x64 we must materialize the target using specific registers. - addr->gtRegNum = REG_VIRTUAL_STUB_PARAM; + addr->gtRegNum = REG_VIRTUAL_STUB_PARAM; + indir->gtRegNum = REG_JUMP_THUNK_PARAM; + indir->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; #endif result = indir; } @@ -3042,8 +3420,6 @@ bool Lowering::AreSourcesPossiblyModifiedLocals(GenTree* addr, GenTree* base, Ge return true; } } - - unreached(); } //------------------------------------------------------------------------ @@ -3082,9 +3458,9 @@ GenTree* Lowering::TryCreateAddrMode(LIR::Use&& use, bool isIndir) { // We can have an indirection on the rhs of a block copy (it is the source // object). This is not a "regular" indirection. - // (Note that the parent check could be costly.) - GenTree* parent = indir->gtGetParent(nullptr); - if ((parent != nullptr) && parent->OperIsIndir()) + // (Note that the user check could be costly.) 
+ LIR::Use indirUse; + if (BlockRange().TryGetUse(indir, &indirUse) && indirUse.User()->OperIsIndir()) { isIndir = false; } @@ -3248,9 +3624,14 @@ void Lowering::LowerUnsignedDivOrMod(GenTree* node) { assert((node->OperGet() == GT_UDIV) || (node->OperGet() == GT_UMOD)); - GenTree* divisor = node->gtGetOp2(); + GenTree* divisor = node->gtGetOp2(); + GenTree* dividend = node->gtGetOp1(); - if (divisor->IsCnsIntOrI()) + if (divisor->IsCnsIntOrI() +#ifdef _TARGET_X86_ + && (dividend->OperGet() != GT_LONG) +#endif + ) { size_t divisorValue = static_cast<size_t>(divisor->gtIntCon.IconValue()); @@ -3276,6 +3657,91 @@ void Lowering::LowerUnsignedDivOrMod(GenTree* node) } //------------------------------------------------------------------------ +// GetSignedMagicNumberForDivide: Generates a magic number and shift amount for +// the magic number division optimization. +// +// Arguments: +// denom - The denominator +// shift - Pointer to the shift value to be returned +// +// Returns: +// The magic number. +// +// Notes: +// This code is previously from UTC where it notes it was taken from +// _The_PowerPC_Compiler_Writer's_Guide_, pages 57-58. The paper is is based on +// is "Division by invariant integers using multiplication" by Torbjorn Granlund +// and Peter L. Montgomery in PLDI 94 + +template <typename T> +T GetSignedMagicNumberForDivide(T denom, int* shift /*out*/) +{ + // static SMAG smag; + const int bits = sizeof(T) * 8; + const int bits_minus_1 = bits - 1; + + typedef typename jitstd::make_unsigned<T>::type UT; + + const UT two_nminus1 = UT(1) << bits_minus_1; + + int p; + UT absDenom; + UT absNc; + UT delta; + UT q1; + UT r1; + UT r2; + UT q2; + UT t; + T result_magic; + int result_shift; + int iters = 0; + + absDenom = abs(denom); + t = two_nminus1 + ((unsigned int)denom >> 31); + absNc = t - 1 - (t % absDenom); // absolute value of nc + p = bits_minus_1; // initialize p + q1 = two_nminus1 / absNc; // initialize q1 = 2^p / abs(nc) + r1 = two_nminus1 - (q1 * absNc); // initialize r1 = rem(2^p, abs(nc)) + q2 = two_nminus1 / absDenom; // initialize q1 = 2^p / abs(denom) + r2 = two_nminus1 - (q2 * absDenom); // initialize r1 = rem(2^p, abs(denom)) + + do + { + iters++; + p++; + q1 *= 2; // update q1 = 2^p / abs(nc) + r1 *= 2; // update r1 = rem(2^p / abs(nc)) + + if (r1 >= absNc) + { // must be unsigned comparison + q1++; + r1 -= absNc; + } + + q2 *= 2; // update q2 = 2^p / abs(denom) + r2 *= 2; // update r2 = rem(2^p / abs(denom)) + + if (r2 >= absDenom) + { // must be unsigned comparison + q2++; + r2 -= absDenom; + } + + delta = absDenom - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + result_magic = q2 + 1; // resulting magic number + if (denom < 0) + { + result_magic = -result_magic; + } + *shift = p - bits; // resulting shift + + return result_magic; +} + +//------------------------------------------------------------------------ // LowerSignedDivOrMod: transform integer GT_DIV/GT_MOD nodes with a power of 2 // const divisor into equivalent but faster sequences. // @@ -3313,8 +3779,10 @@ GenTree* Lowering::LowerSignedDivOrMod(GenTreePtr node) ssize_t divisorValue = divisor->gtIntCon.IconValue(); - if (divisorValue == -1) + if (divisorValue == -1 || divisorValue == 0) { + // x / 0 and x % 0 can't be optimized because they are required to throw an exception. + // x / -1 can't be optimized because INT_MIN / -1 is required to throw an exception. 
// x % -1 is always 0 and the IL spec says that the rem instruction "can" throw an exception if x is @@ -3343,14 +3811,122 @@ GenTree* Lowering::LowerSignedDivOrMod(GenTreePtr node) if (!isPow2(absDivisorValue)) { +#ifdef _TARGET_XARCH_ + ssize_t magic; + int shift; + + if (type == TYP_INT) + { + magic = GetSignedMagicNumberForDivide<int32_t>(static_cast<int32_t>(divisorValue), &shift); + } + else + { +#ifdef _TARGET_64BIT_ + magic = GetSignedMagicNumberForDivide<int64_t>(static_cast<int64_t>(divisorValue), &shift); +#else + unreached(); +#endif + } + + divisor->gtIntConCommon.SetIconValue(magic); + + // Insert a new GT_MULHI node in front of the existing GT_DIV/GT_MOD node. + // The existing node will later be transformed into a GT_ADD/GT_SUB that + // computes the final result. This way don't need to find and change the + // use of the existing node. + GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, divisor, dividend); + BlockRange().InsertBefore(divMod, mulhi); + + // mulhi was the easy part. Now we need to generate different code depending + // on the divisor value: + // For 3 we need: + // div = signbit(mulhi) + mulhi + // For 5 we need: + // div = signbit(mulhi) + sar(mulhi, 1) ; requires shift adjust + // For 7 we need: + // mulhi += dividend ; requires add adjust + // div = signbit(mulhi) + sar(mulhi, 2) ; requires shift adjust + // For -3 we need: + // mulhi -= dividend ; requires sub adjust + // div = signbit(mulhi) + sar(mulhi, 1) ; requires shift adjust + bool requiresAddSubAdjust = signum(divisorValue) != signum(magic); + bool requiresShiftAdjust = shift != 0; + bool requiresDividendMultiuse = requiresAddSubAdjust || !isDiv; + unsigned curBBWeight = comp->compCurBB->getBBWeight(comp); + unsigned dividendLclNum = BAD_VAR_NUM; + + if (requiresDividendMultiuse) + { + LIR::Use dividendUse(BlockRange(), &mulhi->gtOp.gtOp2, mulhi); + dividendLclNum = dividendUse.ReplaceWithLclVar(comp, curBBWeight); + } + + GenTree* adjusted; + + if (requiresAddSubAdjust) + { + dividend = comp->gtNewLclvNode(dividendLclNum, type); + comp->lvaTable[dividendLclNum].incRefCnts(curBBWeight, comp); + + adjusted = comp->gtNewOperNode(divisorValue > 0 ? 
GT_ADD : GT_SUB, type, mulhi, dividend); + BlockRange().InsertBefore(divMod, dividend, adjusted); + } + else + { + adjusted = mulhi; + } + + GenTree* shiftBy = comp->gtNewIconNode(genTypeSize(type) * 8 - 1, type); + GenTree* signBit = comp->gtNewOperNode(GT_RSZ, type, adjusted, shiftBy); + BlockRange().InsertBefore(divMod, shiftBy, signBit); + + LIR::Use adjustedUse(BlockRange(), &signBit->gtOp.gtOp1, signBit); + unsigned adjustedLclNum = adjustedUse.ReplaceWithLclVar(comp, curBBWeight); + adjusted = comp->gtNewLclvNode(adjustedLclNum, type); + comp->lvaTable[adjustedLclNum].incRefCnts(curBBWeight, comp); + BlockRange().InsertBefore(divMod, adjusted); + + if (requiresShiftAdjust) + { + shiftBy = comp->gtNewIconNode(shift, TYP_INT); + adjusted = comp->gtNewOperNode(GT_RSH, type, adjusted, shiftBy); + BlockRange().InsertBefore(divMod, shiftBy, adjusted); + } + + if (isDiv) + { + divMod->SetOperRaw(GT_ADD); + divMod->gtOp.gtOp1 = adjusted; + divMod->gtOp.gtOp2 = signBit; + } + else + { + GenTree* div = comp->gtNewOperNode(GT_ADD, type, adjusted, signBit); + + dividend = comp->gtNewLclvNode(dividendLclNum, type); + comp->lvaTable[dividendLclNum].incRefCnts(curBBWeight, comp); + + // divisor % dividend = dividend - divisor x div + GenTree* divisor = comp->gtNewIconNode(divisorValue, type); + GenTree* mul = comp->gtNewOperNode(GT_MUL, type, div, divisor); + BlockRange().InsertBefore(divMod, dividend, div, divisor, mul); + + divMod->SetOperRaw(GT_SUB); + divMod->gtOp.gtOp1 = dividend; + divMod->gtOp.gtOp2 = mul; + } + + return mulhi; +#else + // Currently there's no GT_MULHI for ARM32/64 return next; +#endif } - // We're committed to the conversion now. Go find the use. + // We're committed to the conversion now. Go find the use if any. LIR::Use use; if (!BlockRange().TryGetUse(node, &use)) { - assert(!"signed DIV/MOD node is unused"); return next; } @@ -3450,8 +4026,6 @@ void Lowering::LowerStoreInd(GenTree* node) void Lowering::LowerBlockStore(GenTreeBlk* blkNode) { GenTree* src = blkNode->Data(); - // TODO-1stClassStructs: Don't require this. - assert(blkNode->OperIsInitBlkOp() || !src->OperIsLocal()); TryCreateAddrMode(LIR::Use(BlockRange(), &blkNode->Addr(), blkNode), false); } @@ -3817,17 +4391,17 @@ void Lowering::CheckCallArg(GenTree* arg) break; #endif - case GT_LIST: - { - GenTreeArgList* list = arg->AsArgList(); - assert(list->IsAggregate()); + case GT_FIELD_LIST: + { + GenTreeFieldList* list = arg->AsFieldList(); + assert(list->IsFieldListHead()); - for (; list != nullptr; list = list->Rest()) - { - assert(list->Current()->OperIsPutArg()); - } + for (; list != nullptr; list = list->Rest()) + { + assert(list->Current()->OperIsPutArg()); } - break; + } + break; default: assert(arg->OperIsPutArg()); |
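
Below is a minimal standalone sketch (plain C++, not JIT LIR; the function name is illustrative) of the branch structure that the LowerCompare change above produces for a signed 64-bit `<` feeding a GT_JTRUE on a 32-bit target: the high halves are compared first as signed values, and only when they are equal does control reach the comparison of the low halves, which is performed unsigned (the GTF_UNSIGNED flag set on the low compare).

// Scalar model of the two-compare decomposition: BB0 tests "!cond(hi) && !eq(hi)",
// BB3 tests "cond(hi) && !eq(hi)", and BB4 compares the low halves unsigned.
#include <cassert>
#include <cstdint>

static bool JumpTakenForLongLessThan(int64_t x, int64_t y)
{
    int32_t  xHi = (int32_t)(x >> 32);  // hi(x), compared signed
    int32_t  yHi = (int32_t)(y >> 32);
    uint32_t xLo = (uint32_t)x;         // lo(x), compared unsigned
    uint32_t yLo = (uint32_t)y;

    if (xHi > yHi)                      // BB0: "!cond(hi) && !eq(hi)" -> branch not taken
        return false;
    if (xHi < yHi)                      // BB3: "cond(hi) && !eq(hi)" -> branch taken
        return true;
    return xLo < yLo;                   // BB4: cond(lo), unsigned
}

int main()
{
    assert(JumpTakenForLongLessThan(2, 3));
    assert(!JumpTakenForLongLessThan(0x100000000LL, 3)); // high halves differ
    assert(JumpTakenForLongLessThan(-1, 0));             // high halves: -1 < 0
    assert(!JumpTakenForLongLessThan(3, 3));
    return 0;
}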
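The signed magic-number division added in LowerSignedDivOrMod can also be illustrated in scalar form. The following self-contained sketch (plain C++; SignedMagic32 and DivByConst are assumed names, not JIT code) follows the GetSignedMagicNumberForDivide derivation above (Granlund and Montgomery, "Division by invariant integers using multiplication", PLDI '94) and mirrors the emitted node sequence for a non-power-of-2 constant divisor: a GT_MULHI, an add/sub adjust when the magic number and the divisor have opposite signs, an arithmetic shift when the shift amount is nonzero, and the sign bit added back. The remainder case would then be dividend - quotient * divisor, as in the GT_SUB/GT_MUL sequence above.

#include <cassert>
#include <cstdint>
#include <cstdlib>

// 32-bit specialization of the magic-number derivation shown in the diff.
static int32_t SignedMagic32(int32_t denom, int* shift)
{
    const uint32_t two31 = 0x80000000u;

    uint32_t absDenom = (uint32_t)std::abs(denom);
    uint32_t t        = two31 + ((uint32_t)denom >> 31);
    uint32_t absNc    = t - 1 - (t % absDenom);   // absolute value of nc
    int      p        = 31;
    uint32_t q1 = two31 / absNc;                  // 2^p / abs(nc)
    uint32_t r1 = two31 - q1 * absNc;             // rem(2^p, abs(nc))
    uint32_t q2 = two31 / absDenom;               // 2^p / abs(denom)
    uint32_t r2 = two31 - q2 * absDenom;          // rem(2^p, abs(denom))
    uint32_t delta;

    do
    {
        p++;
        q1 *= 2;
        r1 *= 2;
        if (r1 >= absNc)
        {
            q1++;
            r1 -= absNc;
        }
        q2 *= 2;
        r2 *= 2;
        if (r2 >= absDenom)
        {
            q2++;
            r2 -= absDenom;
        }
        delta = absDenom - r2;
    } while (q1 < delta || (q1 == delta && r1 == 0));

    int32_t magic = (int32_t)(q2 + 1);
    if (denom < 0)
    {
        magic = -magic;
    }
    *shift = p - 32;
    return magic;
}

// n / d for a constant d that is not 0, -1, or a power of 2.
static int32_t DivByConst(int32_t n, int32_t d)
{
    int     shift;
    int32_t magic = SignedMagic32(d, &shift);

    int32_t hi = (int32_t)(((int64_t)n * magic) >> 32);  // GT_MULHI
    if ((d > 0) != (magic > 0))                          // signs differ: adjust
    {
        hi += (d > 0) ? n : -n;                          // GT_ADD / GT_SUB
    }
    int32_t signBit = (int32_t)((uint32_t)hi >> 31);     // GT_RSZ by 31
    return (hi >> shift) + signBit;                      // GT_RSH, then GT_ADD
}

int main()
{
    for (int32_t d : {3, 5, 7, 10, 641, -3, -7, -100000})
    {
        for (int32_t n : {0, 1, -1, 6, -6, 12345678, -12345678, INT32_MAX, INT32_MIN + 1})
        {
            assert(DivByConst(n, d) == n / d);
        }
    }
    return 0;
}

The divisor values 3, 5, 7, and -3 exercise exactly the four adjust/shift combinations called out in the comments above the mulhi node.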