Diffstat (limited to 'src/jit/lower.cpp')
-rw-r--r-- | src/jit/lower.cpp | 724
1 file changed, 449 insertions, 275 deletions
diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp
index a6e50b304c..0316a34a21 100644
--- a/src/jit/lower.cpp
+++ b/src/jit/lower.cpp
@@ -167,8 +167,13 @@ GenTree* Lowering::LowerNode(GenTree* node)
         case GT_STORE_BLK:
         case GT_STORE_OBJ:
         case GT_STORE_DYN_BLK:
-            LowerBlockStore(node->AsBlk());
-            break;
+        {
+            // TODO-Cleanup: Consider moving this code to LowerBlockStore, which is currently
+            // called from TreeNodeInfoInitBlockStore, and calling that method here.
+            GenTreeBlk* blkNode = node->AsBlk();
+            TryCreateAddrMode(LIR::Use(BlockRange(), &blkNode->Addr(), blkNode), false);
+        }
+        break;
 
 #ifdef FEATURE_SIMD
         case GT_SIMD:
@@ -236,20 +241,14 @@ GenTree* Lowering::LowerNode(GenTree* node)
             unsigned   varNum = node->AsLclVarCommon()->GetLclNum();
             LclVarDsc* varDsc = &comp->lvaTable[varNum];
 
-#if defined(_TARGET_64BIT_)
-            assert(varDsc->lvSize() == 16);
-            node->gtType = TYP_SIMD16;
-#else  // !_TARGET_64BIT_
-            if (varDsc->lvSize() == 16)
+            if (comp->lvaMapSimd12ToSimd16(varDsc))
             {
+                JITDUMP("Mapping TYP_SIMD12 lclvar node to TYP_SIMD16:\n");
+                DISPNODE(node);
+                JITDUMP("============");
+
                 node->gtType = TYP_SIMD16;
             }
-            else
-            {
-                // The following assert is guaranteed by lvSize().
-                assert(varDsc->lvIsParam);
-            }
-#endif // !_TARGET_64BIT_
         }
 #endif // FEATURE_SIMD
         __fallthrough;
@@ -549,7 +548,7 @@ GenTree* Lowering::LowerSwitch(GenTree* node)
     // If the number of possible destinations is small enough, we proceed to expand the switch
     // into a series of conditional branches, otherwise we follow the jump table based switch
    // transformation.
-    else if (jumpCnt < minSwitchTabJumpCnt)
+    else if ((jumpCnt < minSwitchTabJumpCnt) || comp->compStressCompile(Compiler::STRESS_SWITCH_CMP_BR_EXPANSION, 50))
     {
         // Lower the switch into a series of compare and branch IR trees.
         //
@@ -639,7 +638,7 @@ GenTree* Lowering::LowerSwitch(GenTree* node)
 
                 GenTreePtr gtCaseBranch = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, gtCaseCond);
                 LIR::Range caseRange    = LIR::SeqTree(comp, gtCaseBranch);
-                currentBBRange->InsertAtEnd(std::move(condRange));
+                currentBBRange->InsertAtEnd(std::move(caseRange));
             }
         }
 
@@ -757,16 +756,6 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP
     GenTreePtr putArg         = nullptr;
     bool       updateArgTable = true;
 
-#if !defined(_TARGET_64BIT_)
-    if (varTypeIsLong(type))
-    {
-        // For TYP_LONG, we leave the GT_LONG as the arg, and put the putArg below it.
-        // Therefore, we don't update the arg table entry.
-        updateArgTable = false;
-        type           = TYP_INT;
-    }
-#endif // !defined(_TARGET_64BIT_)
-
     bool isOnStack = true;
 #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
     if (varTypeIsStruct(type))
@@ -954,6 +943,11 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP
                                                     info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots) DEBUGARG(call));
 #endif
 
+#if defined(UNIX_X86_ABI)
+        assert((info->padStkAlign > 0 && info->numSlots > 0) || (info->padStkAlign == 0));
+        putArg->AsPutArgStk()->setArgPadding(info->padStkAlign);
+#endif
+
 #ifdef FEATURE_PUT_STRUCT_ARG_STK
         // If the ArgTabEntry indicates that this arg is a struct
         // get and store the number of slots that are references.
@@ -1084,25 +1078,22 @@ void Lowering::LowerArg(GenTreeCall* call, GenTreePtr* ppArg)
             NYI("Lowering of long register argument");
         }
 
-        // For longs, we will create two PUTARG_STKs below the GT_LONG. The hi argument needs to
-        // be pushed first, so the hi PUTARG_STK will precede the lo PUTARG_STK in execution order.
+        // For longs, we will replace the GT_LONG with a GT_FIELD_LIST, and put that under a PUTARG_STK.
+        // Although the hi argument needs to be pushed first, that will be handled by the general case,
+        // in which the fields will be reversed.
         noway_assert(arg->OperGet() == GT_LONG);
-        GenTreePtr argLo = arg->gtGetOp1();
-        GenTreePtr argHi = arg->gtGetOp2();
-
-        GenTreePtr putArgLo = NewPutArg(call, argLo, info, type);
-        GenTreePtr putArgHi = NewPutArg(call, argHi, info, type);
-
-        arg->gtOp.gtOp1 = putArgLo;
-        arg->gtOp.gtOp2 = putArgHi;
-
-        BlockRange().InsertBefore(arg, putArgHi, putArgLo);
-
-        // The execution order now looks like this:
-        //     argLoPrev <-> argLoFirst ... argLo <-> argHiFirst ... argHi <-> putArgHi <-> putArgLo <-> arg(GT_LONG)
-
-        assert((arg->gtFlags & GTF_REVERSE_OPS) == 0);
-        arg->gtFlags |= GTF_REVERSE_OPS; // We consume the high arg (op2) first.
+        assert(info->numSlots == 2);
+        GenTreePtr        argLo     = arg->gtGetOp1();
+        GenTreePtr        argHi     = arg->gtGetOp2();
+        GenTreeFieldList* fieldList = new (comp, GT_FIELD_LIST) GenTreeFieldList(argLo, 0, TYP_INT, nullptr);
+        // Only the first fieldList node (GTF_FIELD_LIST_HEAD) is in the instruction sequence.
+        (void)new (comp, GT_FIELD_LIST) GenTreeFieldList(argHi, 4, TYP_INT, fieldList);
+        putArg = NewPutArg(call, fieldList, info, TYP_VOID);
+
+        // We can't call ReplaceArgWithPutArgOrCopy here because it presumes that we are keeping the original arg.
+        BlockRange().InsertBefore(arg, fieldList, putArg);
+        BlockRange().Remove(arg);
+        *ppArg = putArg;
     }
     else
 #endif // !defined(_TARGET_64BIT_)
@@ -1872,6 +1863,7 @@ GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget
     bool               isClosed;
     LIR::ReadOnlyRange secondArgRange = BlockRange().GetTreeRange(arg0, &isClosed);
     assert(isClosed);
+    BlockRange().Remove(std::move(secondArgRange));
 
     argEntry->node->gtOp.gtOp1 = callTarget;
 
@@ -1935,251 +1927,439 @@ GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget
 }
 
 //------------------------------------------------------------------------
-// Lowering::LowerCompare: lowers a compare node.
-//
-// For 64-bit targets, this doesn't do much of anything: all comparisons
-// that we support can be handled in code generation on such targets.
-//
-// For 32-bit targets, however, any comparison that feeds a `GT_JTRUE`
-// node must be lowered such that the liveness of the operands to the
-// comparison is properly visible to the rest of the backend. As such,
-// a 64-bit comparison is lowered from something like this:
-//
-//    ------------ BB02 [004..014) -> BB02 (cond), preds={BB02,BB01} succs={BB03,BB02}
-//    N001 (  1,  1) [000006] ------------    t6 = lclVar    int    V02 loc0    u:5 $148
-//
-//                                                 /--* t6     int
-//    N002 (  2,  3) [000007] ---------U--    t7 = *  cast      long <- ulong <- uint $3c0
+// Lowering::LowerCompare: Lowers a compare node.
 //
-//    N003 (  3, 10) [000009] ------------    t9 = lconst    long   0x0000000000000003 $101
-//
-//                                                 /--* t7     long
-//                                                 +--* t9     long
-//    N004 (  9, 17) [000010] N------N-U--   t10 = *  <         int    $149
-//
-//                                                 /--* t10    int
-//    N005 ( 11, 19) [000011] ------------        *  jmpTrue   void
-//
-// To something like this:
-//
-//    ------------ BB02 [004..014) -> BB03 (cond), preds={BB06,BB07,BB01} succs={BB06,BB03}
-//                   [000099] ------------   t99 = const     int    0
-//
-//                   [000101] ------------  t101 = const     int    0
-//
-//                                                 /--* t99    int
-//                                                 +--* t101   int
-//    N004 (  9, 17) [000010] N------N-U--   t10 = *  >         int    $149
-//
-//                                                 /--* t10    int
-//    N005 ( 11, 19) [000011] ------------        *  jmpTrue   void
-//
-//
-//    ------------ BB06 [???..???) -> BB02 (cond), preds={BB02} succs={BB07,BB02}
-//                   [000105] -------N-U--         jcc       void   cond=<
-//
-//
-//    ------------ BB07 [???..???) -> BB02 (cond), preds={BB06} succs={BB03,BB02}
-//    N001 (  1,  1) [000006] ------------    t6 = lclVar    int    V02 loc0    u:5 $148
-//
-//    N003 (  3, 10) [000009] ------------    t9 = const     int    3
-//
-//                                                 /--* t6     int
-//                                                 +--* t9     int
-//                   [000106] N------N-U--  t106 = *  <         int
-//
-//                                                 /--* t106   int
-//                   [000107] ------------        *  jmpTrue   void
-//
-// Which will eventually generate code similar to the following:
-//
-//    33DB         xor      ebx, ebx
-//    85DB         test     ebx, ebx
-//    7707         ja       SHORT G_M50523_IG04
-//    72E7         jb       SHORT G_M50523_IG03
-//    83F803       cmp      eax, 3
-//    72E2         jb       SHORT G_M50523_IG03
+// Arguments:
+//    cmp - the compare node
 //
+// Notes:
+//    - Decomposes long comparisons that feed a GT_JTRUE (32 bit specific).
+//    - Ensures that we don't have a mix of int/long operands (XARCH specific).
+//    - Narrow operands to enable memory operand containment (XARCH specific).
+//    - Transform cmp(and(x, y), 0) into test(x, y) (XARCH specific but could
+//      be used for ARM as well if support for GT_TEST_EQ/GT_TEST_NE is added).
+
 void Lowering::LowerCompare(GenTree* cmp)
 {
 #ifndef _TARGET_64BIT_
-    if (cmp->gtGetOp1()->TypeGet() != TYP_LONG)
-    {
-        return;
-    }
-
     LIR::Use cmpUse;
-    if (!BlockRange().TryGetUse(cmp, &cmpUse) || cmpUse.User()->OperGet() != GT_JTRUE)
+    if ((cmp->gtGetOp1()->TypeGet() == TYP_LONG) && BlockRange().TryGetUse(cmp, &cmpUse) &&
+        cmpUse.User()->OperIs(GT_JTRUE))
     {
-        return;
-    }
+        // For 32-bit targets any comparison that feeds a `GT_JTRUE` node must be lowered such that
+        // the liveness of the operands to the comparison is properly visible to the rest of the
+        // backend. As such, a 64-bit comparison is lowered from something like this:
+        //
+        //    ------------ BB02 [004..014) -> BB02 (cond), preds={BB02,BB01} succs={BB03,BB02}
+        //    N001 (  1,  1) [000006] ------------    t6 = lclVar    int    V02 loc0    u:5 $148
+        //
+        //                                                 /--* t6     int
+        //    N002 (  2,  3) [000007] ---------U--    t7 = *  cast      long <- ulong <- uint $3c0
+        //
+        //    N003 (  3, 10) [000009] ------------    t9 = lconst    long   0x0000000000000003 $101
+        //
+        //                                                 /--* t7     long
+        //                                                 +--* t9     long
+        //    N004 (  9, 17) [000010] N------N-U--   t10 = *  <         int    $149
+        //
+        //                                                 /--* t10    int
+        //    N005 ( 11, 19) [000011] ------------        *  jmpTrue   void
+        //
+        // To something like this:
+        //
+        //    ------------ BB02 [004..014) -> BB03 (cond), preds={BB06,BB07,BB01} succs={BB06,BB03}
+        //                   [000099] ------------   t99 = const     int    0
+        //
+        //                   [000101] ------------  t101 = const     int    0
+        //
+        //                                                 /--* t99    int
+        //                                                 +--* t101   int
+        //    N004 (  9, 17) [000010] N------N-U--   t10 = *  >         int    $149
+        //
+        //                                                 /--* t10    int
+        //    N005 ( 11, 19) [000011] ------------        *  jmpTrue   void
+        //
+        //
+        //    ------------ BB06 [???..???) -> BB02 (cond), preds={BB02} succs={BB07,BB02}
+        //                   [000105] -------N-U--         jcc       void   cond=<
+        //
+        //
+        //    ------------ BB07 [???..???) -> BB02 (cond), preds={BB06} succs={BB03,BB02}
+        //    N001 (  1,  1) [000006] ------------    t6 = lclVar    int    V02 loc0    u:5 $148
+        //
+        //    N003 (  3, 10) [000009] ------------    t9 = const     int    3
+        //
+        //                                                 /--* t6     int
+        //                                                 +--* t9     int
+        //                   [000106] N------N-U--  t106 = *  <         int
+        //
+        //                                                 /--* t106   int
+        //                   [000107] ------------        *  jmpTrue   void
+        //
+        // Which will eventually generate code similar to the following:
+        //
+        //    33DB         xor      ebx, ebx
+        //    85DB         test     ebx, ebx
+        //    7707         ja       SHORT G_M50523_IG04
+        //    72E7         jb       SHORT G_M50523_IG03
+        //    83F803       cmp      eax, 3
+        //    72E2         jb       SHORT G_M50523_IG03
+        //
 
-    GenTree* src1   = cmp->gtGetOp1();
-    GenTree* src2   = cmp->gtGetOp2();
-    unsigned weight = m_block->getBBWeight(comp);
+        GenTree* src1   = cmp->gtGetOp1();
+        GenTree* src2   = cmp->gtGetOp2();
+        unsigned weight = m_block->getBBWeight(comp);
 
-    LIR::Use loSrc1(BlockRange(), &(src1->gtOp.gtOp1), src1);
-    LIR::Use loSrc2(BlockRange(), &(src2->gtOp.gtOp1), src2);
+        LIR::Use loSrc1(BlockRange(), &(src1->gtOp.gtOp1), src1);
+        LIR::Use loSrc2(BlockRange(), &(src2->gtOp.gtOp1), src2);
 
-    if (loSrc1.Def()->OperGet() != GT_CNS_INT && loSrc1.Def()->OperGet() != GT_LCL_VAR)
-    {
-        loSrc1.ReplaceWithLclVar(comp, weight);
-    }
+        // TODO-CQ-32bit: We should move more code to the new basic block, currently we're only moving
+        // constants and lclvars. In particular, it would be nice to move GT_AND nodes as that would
+        // enable the and-cmp to test transform that happens later in this function. Though that's not
+        // exactly ideal, the and-cmp to test transform should run before this code but:
+        //  - it would need to run before decomposition otherwise it won't recognize the 0 constant
+        //    because after decomposition it is packed in a GT_LONG
+        //  - this code would also need to handle GT_TEST_EQ/GT_TEST_NE
 
-    if (loSrc2.Def()->OperGet() != GT_CNS_INT && loSrc2.Def()->OperGet() != GT_LCL_VAR)
-    {
-        loSrc2.ReplaceWithLclVar(comp, weight);
-    }
+        if (!loSrc1.Def()->OperIs(GT_CNS_INT, GT_LCL_VAR))
+        {
+            loSrc1.ReplaceWithLclVar(comp, weight);
+        }
+
+        if (!loSrc2.Def()->OperIs(GT_CNS_INT, GT_LCL_VAR))
+        {
+            loSrc2.ReplaceWithLclVar(comp, weight);
+        }
 
-    BasicBlock* jumpDest = m_block->bbJumpDest;
-    BasicBlock* nextDest = m_block->bbNext;
-    BasicBlock* newBlock = comp->fgSplitBlockAtEnd(m_block);
+        BasicBlock* jumpDest = m_block->bbJumpDest;
+        BasicBlock* nextDest = m_block->bbNext;
+        BasicBlock* newBlock = comp->fgSplitBlockAtEnd(m_block);
 
-    cmp->gtType     = TYP_INT;
-    cmp->gtOp.gtOp1 = src1->gtOp.gtOp2;
-    cmp->gtOp.gtOp2 = src2->gtOp.gtOp2;
+        cmp->gtType     = TYP_INT;
+        cmp->gtOp.gtOp1 = src1->gtOp.gtOp2;
+        cmp->gtOp.gtOp2 = src2->gtOp.gtOp2;
 
-    if (cmp->OperGet() == GT_EQ || cmp->OperGet() == GT_NE)
-    {
-        // 64-bit equality comparisons (no matter the polarity) require two 32-bit comparisons: one for the upper 32
-        // bits and one for the lower 32 bits. As such, we update the flow graph like so:
-        //
-        //     Before:
-        //                 BB0: cond
-        //                   /   \
-        //                false  true
-        //                  |      |
-        //                 BB1    BB2
-        //
-        //     After:
-        //                 BB0: cond(hi)
-        //                   /        \
-        //                false        true
-        //                  |            |
-        //                  |     BB3: cond(lo)
-        //                  |         /    \
-        //                  |      false    true
-        //                   \      /        |
-        //                     BB1          BB2
-        //
+        if (cmp->OperIs(GT_EQ, GT_NE))
+        {
+            // 64-bit equality comparisons (no matter the polarity) require two 32-bit comparisons: one for the upper 32
+            // bits and one for the lower 32 bits. As such, we update the flow graph like so:
+            //
+            //     Before:
+            //                 BB0: cond
+            //                   /   \
+            //                false  true
+            //                  |      |
+            //                 BB1    BB2
+            //
+            //     After:
+            //                 BB0: cond(hi)
+            //                   /        \
+            //                false        true
+            //                  |            |
+            //                  |     BB3: cond(lo)
+            //                  |         /    \
+            //                  |      false    true
+            //                   \      /        |
+            //                     BB1          BB2
+            //
 
-        BlockRange().Remove(loSrc1.Def());
-        BlockRange().Remove(loSrc2.Def());
-        GenTree* loCmp   = comp->gtNewOperNode(cmp->OperGet(), TYP_INT, loSrc1.Def(), loSrc2.Def());
-        loCmp->gtFlags   = cmp->gtFlags;
-        GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp);
-        LIR::AsRange(newBlock).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue);
+            BlockRange().Remove(loSrc1.Def());
+            BlockRange().Remove(loSrc2.Def());
+            GenTree* loCmp   = comp->gtNewOperNode(cmp->OperGet(), TYP_INT, loSrc1.Def(), loSrc2.Def());
+            loCmp->gtFlags   = cmp->gtFlags;
+            GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp);
+            LIR::AsRange(newBlock).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue);
 
-        m_block->bbJumpKind = BBJ_COND;
+            m_block->bbJumpKind = BBJ_COND;
+
+            if (cmp->OperIs(GT_EQ))
+            {
+                cmp->gtOper         = GT_NE;
+                m_block->bbJumpDest = nextDest;
+                nextDest->bbFlags |= BBF_JMP_TARGET;
+                comp->fgAddRefPred(nextDest, m_block);
+            }
+            else
+            {
+                m_block->bbJumpDest = jumpDest;
+                comp->fgAddRefPred(jumpDest, m_block);
+            }
 
-        if (cmp->OperGet() == GT_EQ)
+            assert(newBlock->bbJumpKind == BBJ_COND);
+            assert(newBlock->bbJumpDest == jumpDest);
+        }
+        else
         {
-            cmp->gtOper = GT_NE;
+            // 64-bit ordinal comparisons are more complicated: they require two comparisons for the upper 32 bits and
+            // one comparison for the lower 32 bits. We update the flowgraph as such:
+            //
+            //     Before:
+            //                 BB0: cond
+            //                   /   \
+            //                false  true
+            //                  |      |
+            //                 BB1    BB2
+            //
+            //     After:
+            //                 BB0: (!cond(hi) && !eq(hi))
+            //                   /                \
+            //                 true              false
+            //                  |                  |
+            //                  |      BB3: (cond(hi) && !eq(hi))
+            //                  |          /           \
+            //                  |       false           true
+            //                  |         |               |
+            //                  |      BB4: cond(lo)      |
+            //                  |        /   \            |
+            //                  |     false   true        |
+            //                   \     /       \         /
+            //                     BB1            BB2
+            //
+            //
+            // Note that the actual comparisons used to implement "(!cond(hi) && !eq(hi))" and "(cond(hi) && !eq(hi))"
+            // differ based on the original condition, and all consist of a single node. The switch statement below
+            // performs the necessary mapping.
+            //
+
+            genTreeOps hiCmpOper;
+            genTreeOps loCmpOper;
+
+            switch (cmp->OperGet())
+            {
+                case GT_LT:
+                    cmp->gtOper = GT_GT;
+                    hiCmpOper   = GT_LT;
+                    loCmpOper   = GT_LT;
+                    break;
+                case GT_LE:
+                    cmp->gtOper = GT_GT;
+                    hiCmpOper   = GT_LT;
+                    loCmpOper   = GT_LE;
+                    break;
+                case GT_GT:
+                    cmp->gtOper = GT_LT;
+                    hiCmpOper   = GT_GT;
+                    loCmpOper   = GT_GT;
+                    break;
+                case GT_GE:
+                    cmp->gtOper = GT_LT;
+                    hiCmpOper   = GT_GT;
+                    loCmpOper   = GT_GE;
+                    break;
+                default:
+                    unreached();
+            }
+
+            BasicBlock* newBlock2 = comp->fgSplitBlockAtEnd(newBlock);
+
+            GenTree* hiJcc = new (comp, GT_JCC) GenTreeJumpCC(hiCmpOper);
+            hiJcc->gtFlags = cmp->gtFlags;
+            LIR::AsRange(newBlock).InsertAfter(nullptr, hiJcc);
+
+            BlockRange().Remove(loSrc1.Def());
+            BlockRange().Remove(loSrc2.Def());
+            GenTree* loCmp   = comp->gtNewOperNode(loCmpOper, TYP_INT, loSrc1.Def(), loSrc2.Def());
+            loCmp->gtFlags   = cmp->gtFlags | GTF_UNSIGNED;
+            GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp);
+            LIR::AsRange(newBlock2).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue);
+
+            m_block->bbJumpKind = BBJ_COND;
             m_block->bbJumpDest = nextDest;
             nextDest->bbFlags |= BBF_JMP_TARGET;
             comp->fgAddRefPred(nextDest, m_block);
+
+            newBlock->bbJumpKind = BBJ_COND;
+            newBlock->bbJumpDest = jumpDest;
+            comp->fgAddRefPred(jumpDest, newBlock);
+
+            assert(newBlock2->bbJumpKind == BBJ_COND);
+            assert(newBlock2->bbJumpDest == jumpDest);
         }
-        else
+
+        BlockRange().Remove(src1);
+        BlockRange().Remove(src2);
+    }
+#endif
+
+#ifdef _TARGET_XARCH_
+#ifdef _TARGET_AMD64_
+    if (cmp->gtGetOp1()->TypeGet() != cmp->gtGetOp2()->TypeGet())
+    {
+        bool op1Is64Bit = (genTypeSize(cmp->gtGetOp1()->TypeGet()) == 8);
+        bool op2Is64Bit = (genTypeSize(cmp->gtGetOp2()->TypeGet()) == 8);
+
+        if (op1Is64Bit != op2Is64Bit)
         {
-            m_block->bbJumpDest = jumpDest;
-            comp->fgAddRefPred(jumpDest, m_block);
-        }
+            //
+            // Normally this should not happen. IL allows comparing int32 to native int but the importer
+            // automatically inserts a cast from int32 to long on 64 bit architectures. However, the JIT
+            // accidentally generates int/long comparisons internally:
+            //   - loop cloning compares int (and even small int) index limits against long constants
+            //   - switch lowering compares a 64 bit switch value against a int32 constant
+            //
+            // TODO-Cleanup: The above mentioned issues should be fixed and then the code below may be
+            // replaced with an assert or at least simplified. The special casing of constants in code
+            // below is only necessary to prevent worse code generation for switches and loop cloning.
+            //
 
-        assert(newBlock->bbJumpKind == BBJ_COND);
-        assert(newBlock->bbJumpDest == jumpDest);
+            GenTree*  longOp       = op1Is64Bit ? cmp->gtOp.gtOp1 : cmp->gtOp.gtOp2;
+            GenTree** smallerOpUse = op2Is64Bit ? &cmp->gtOp.gtOp1 : &cmp->gtOp.gtOp2;
+            var_types smallerType  = (*smallerOpUse)->TypeGet();
+
+            assert(genTypeSize(smallerType) < 8);
+
+            if (longOp->IsCnsIntOrI() && genTypeCanRepresentValue(smallerType, longOp->AsIntCon()->IconValue()))
+            {
+                longOp->gtType = smallerType;
+            }
+            else if ((*smallerOpUse)->IsCnsIntOrI())
+            {
+                (*smallerOpUse)->gtType = TYP_LONG;
+            }
+            else
+            {
+                GenTree* cast = comp->gtNewCastNode(TYP_LONG, *smallerOpUse, TYP_LONG);
+                *smallerOpUse = cast;
+                BlockRange().InsertAfter(cast->gtGetOp1(), cast);
+            }
+        }
     }
-    else
+#endif // _TARGET_AMD64_
+
+    if (cmp->gtGetOp2()->IsIntegralConst())
     {
-        // 64-bit ordinal comparisons are more complicated: they require two comparisons for the upper 32 bits and one
-        // comparison for the lower 32 bits. We update the flowgraph as such:
-        //
-        //     Before:
-        //                 BB0: cond
-        //                   /   \
-        //                false  true
-        //                  |      |
-        //                 BB1    BB2
-        //
-        //     After:
-        //                 BB0: (!cond(hi) && !eq(hi))
-        //                   /                \
-        //                 true              false
-        //                  |                  |
-        //                  |      BB3: (cond(hi) && !eq(hi))
-        //                  |          /           \
-        //                  |       false           true
-        //                  |         |               |
-        //                  |      BB4: cond(lo)      |
-        //                  |        /   \            |
-        //                  |     false   true        |
-        //                   \     /       \         /
-        //                     BB1            BB2
-        //
-        //
-        // Note that the actual comparisons used to implement "(!cond(hi) && !eq(hi))" and "(cond(hi) && !eq(hi))"
-        // differ based on the original condition, and all consist of a single node. The switch statement below
-        // performs the necessary mapping.
-        //
+        GenTree*       op1      = cmp->gtGetOp1();
+        var_types      op1Type  = op1->TypeGet();
+        GenTreeIntCon* op2      = cmp->gtGetOp2()->AsIntCon();
+        ssize_t        op2Value = op2->IconValue();
 
-        genTreeOps hiCmpOper;
-        genTreeOps loCmpOper;
+        if (op1->isMemoryOp() && varTypeIsSmall(op1Type) && genTypeCanRepresentValue(op1Type, op2Value))
+        {
+            //
+            // If op1's type is small then try to narrow op2 so it has the same type as op1.
+            // Small types are usually used by memory loads and if both compare operands have
+            // the same type then the memory load can be contained. In certain situations
+            // (e.g "cmp ubyte, 200") we also get a smaller instruction encoding.
+            //
 
-        switch (cmp->OperGet())
+            op2->gtType = op1Type;
+        }
+        else if (op1->OperIs(GT_CAST) && !op1->gtOverflow())
         {
-            case GT_LT:
-                cmp->gtOper = GT_GT;
-                hiCmpOper   = GT_LT;
-                loCmpOper   = GT_LT;
-                break;
-            case GT_LE:
-                cmp->gtOper = GT_GT;
-                hiCmpOper   = GT_LT;
-                loCmpOper   = GT_LE;
-                break;
-            case GT_GT:
-                cmp->gtOper = GT_LT;
-                hiCmpOper   = GT_GT;
-                loCmpOper   = GT_GT;
-                break;
-            case GT_GE:
-                cmp->gtOper = GT_LT;
-                hiCmpOper   = GT_GT;
-                loCmpOper   = GT_GE;
-                break;
-            default:
-                unreached();
+            GenTreeCast* cast       = op1->AsCast();
+            var_types    castToType = cast->CastToType();
+            GenTree*     castOp     = cast->gtGetOp1();
+
+            if (((castToType == TYP_BOOL) || (castToType == TYP_UBYTE)) && FitsIn<UINT8>(op2Value))
+            {
+                //
+                // Since we're going to remove the cast we need to be able to narrow the cast operand
+                // to the cast type. This can be done safely only for certain opers (e.g AND, OR, XOR).
+                // Some opers just can't be narrowed (e.g DIV, MUL) while other could be narrowed but
+                // doing so would produce incorrect results (e.g. RSZ, RSH).
+                //
+                // The below list of handled opers is conservative but enough to handle the most common
+                // situations. In particular this include CALL, sometimes the JIT unnecessarilly widens
+                // the result of bool returning calls.
+                //
+
+                if (castOp->OperIs(GT_CALL, GT_LCL_VAR) || castOp->OperIsLogical() || castOp->isMemoryOp())
+                {
+                    assert(!castOp->gtOverflowEx()); // Must not be an overflow checking operation
+
+                    castOp->gtType  = castToType;
+                    cmp->gtOp.gtOp1 = castOp;
+                    op2->gtType     = castToType;
+
+                    BlockRange().Remove(cast);
+                }
+            }
         }
+        else if (op1->OperIs(GT_AND) && cmp->OperIs(GT_EQ, GT_NE))
+        {
+            //
+            // Transform ((x AND y) EQ|NE 0) into (x TEST_EQ|TEST_NE y) when possible.
+            //
 
-        BasicBlock* newBlock2 = comp->fgSplitBlockAtEnd(newBlock);
+            GenTree* andOp1 = op1->gtGetOp1();
+            GenTree* andOp2 = op1->gtGetOp2();
 
-        GenTree* hiJcc = new (comp, GT_JCC) GenTreeJumpCC(hiCmpOper);
-        hiJcc->gtFlags = cmp->gtFlags;
-        LIR::AsRange(newBlock).InsertAfter(nullptr, hiJcc);
+            if (op2Value != 0)
+            {
+                //
+                // If we don't have a 0 compare we can get one by transforming ((x AND mask) EQ|NE mask)
+                // into ((x AND mask) NE|EQ 0) when mask is a single bit.
+                //
 
-        BlockRange().Remove(loSrc1.Def());
-        BlockRange().Remove(loSrc2.Def());
-        GenTree* loCmp   = comp->gtNewOperNode(loCmpOper, TYP_INT, loSrc1.Def(), loSrc2.Def());
-        loCmp->gtFlags   = cmp->gtFlags | GTF_UNSIGNED;
-        GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp);
-        LIR::AsRange(newBlock2).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue);
+                if (isPow2(static_cast<size_t>(op2Value)) && andOp2->IsIntegralConst(op2Value))
+                {
+                    op2Value = 0;
+                    op2->SetIconValue(0);
+                    cmp->SetOperRaw(GenTree::ReverseRelop(cmp->OperGet()));
+                }
+            }
 
-        m_block->bbJumpKind = BBJ_COND;
-        m_block->bbJumpDest = nextDest;
-        nextDest->bbFlags |= BBF_JMP_TARGET;
-        comp->fgAddRefPred(nextDest, m_block);
+            if (op2Value == 0)
+            {
+                BlockRange().Remove(op1);
+                BlockRange().Remove(op2);
 
-        newBlock->bbJumpKind = BBJ_COND;
-        newBlock->bbJumpDest = jumpDest;
-        comp->fgAddRefPred(jumpDest, newBlock);
+                cmp->SetOperRaw(cmp->OperIs(GT_EQ) ? GT_TEST_EQ : GT_TEST_NE);
+                cmp->gtOp.gtOp1 = andOp1;
+                cmp->gtOp.gtOp2 = andOp2;
 
-        assert(newBlock2->bbJumpKind == BBJ_COND);
-        assert(newBlock2->bbJumpDest == jumpDest);
+                if (andOp1->isMemoryOp() && andOp2->IsIntegralConst())
+                {
+                    //
+                    // For "test" we only care about the bits that are set in the second operand (mask).
+                    // If the mask fits in a small type then we can narrow both operands to generate a "test"
+                    // instruction with a smaller encoding ("test" does not have a r/m32, imm8 form) and avoid
+                    // a widening load in some cases.
+                    //
+                    // For 16 bit operands we narrow only if the memory operand is already 16 bit. This matches
+                    // the behavior of a previous implementation and avoids adding more cases where we generate
+                    // 16 bit instructions that require a length changing prefix (0x66). These suffer from
+                    // significant decoder stalls on Intel CPUs.
+                    //
+                    // We could also do this for 64 bit masks that fit into 32 bit but it doesn't help.
+                    // In such cases morph narrows down the existing GT_AND by inserting a cast between it and
+                    // the memory operand so we'd need to add more code to recognize and eliminate that cast.
+                    //
+
+                    size_t mask = static_cast<size_t>(andOp2->AsIntCon()->IconValue());
+
+                    if (FitsIn<UINT8>(mask))
+                    {
+                        andOp1->gtType = TYP_UBYTE;
+                        andOp2->gtType = TYP_UBYTE;
+                    }
+                    else if (FitsIn<UINT16>(mask) && genTypeSize(andOp1) == 2)
+                    {
+                        andOp1->gtType = TYP_CHAR;
+                        andOp2->gtType = TYP_CHAR;
+                    }
+                }
+            }
+        }
     }
 
-    BlockRange().Remove(src1);
-    BlockRange().Remove(src2);
-#endif
+    if (cmp->gtGetOp1()->TypeGet() == cmp->gtGetOp2()->TypeGet())
+    {
+        if (varTypeIsSmall(cmp->gtGetOp1()->TypeGet()) && varTypeIsUnsigned(cmp->gtGetOp1()->TypeGet()))
+        {
+            //
+            // If both operands have the same type then codegen will use the common operand type to
+            // determine the instruction type. For small types this would result in performing a
+            // signed comparison of two small unsigned values without zero extending them to TYP_INT
+            // which is incorrect. Note that making the comparison unsigned doesn't imply that codegen
+            // has to generate a small comparison, it can still correctly generate a TYP_INT comparison.
+            //
+
+            cmp->gtFlags |= GTF_UNSIGNED;
+        }
+    }
+#endif // _TARGET_XARCH_
 }
 
 // Lower "jmp <method>" tail call to insert PInvoke method epilog if required.
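The single-bit-mask rewrite in the LowerCompare hunk above relies on a small identity: when mask has exactly one bit set, (x & mask) can only be 0 or mask, so comparing the masked value against mask is the same predicate as testing it for non-zero, which is what lets the compare be turned into a TEST_EQ/TEST_NE node. The following standalone C++ sketch (illustration only, not part of lower.cpp; the local isPow2 helper and the sample values are assumptions made for the example) exercises that identity:

#include <cassert>
#include <cstdint>
#include <initializer_list>

// True when exactly one bit of v is set (same meaning as the isPow2 guard in the diff).
static bool isPow2(uint64_t v)
{
    return (v != 0) && ((v & (v - 1)) == 0);
}

int main()
{
    // For every single-bit mask, ((x & mask) == mask) must agree with ((x & mask) != 0);
    // that equivalence is what allows rewriting the compare against 'mask' into a
    // compare against zero before the and-cmp-to-test transform.
    for (uint64_t mask = 1; mask != 0; mask <<= 1)
    {
        assert(isPow2(mask));

        for (uint64_t x : {0ull, 1ull, 0x80ull, 0xFFFFull, 0x8000000000000000ull, ~0ull})
        {
            assert(((x & mask) == mask) == ((x & mask) != 0));
        }
    }
    return 0;
}

The identity does not hold for multi-bit masks (for example x = 1, mask = 3), which is why the rewrite in the diff is guarded by the isPow2 check.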
@@ -3498,18 +3678,19 @@ GenTree* Lowering::TryCreateAddrMode(LIR::Use&& use, bool isIndir)
     // make sure there are not any side effects between def of leaves and use
     if (!doAddrMode || AreSourcesPossiblyModifiedLocals(addr, base, index))
     {
-        JITDUMP("  No addressing mode\n");
+        JITDUMP("No addressing mode:\n  ");
+        DISPNODE(addr);
         return addr;
     }
 
     GenTreePtr arrLength = nullptr;
 
     JITDUMP("Addressing mode:\n");
-    JITDUMP("  Base\n");
+    JITDUMP("  Base\n  ");
     DISPNODE(base);
     if (index != nullptr)
     {
-        JITDUMP("  + Index * %u + %u\n", scale, offset);
+        JITDUMP("  + Index * %u + %u\n  ", scale, offset);
         DISPNODE(index);
     }
     else
@@ -4023,12 +4204,6 @@ void Lowering::LowerStoreInd(GenTree* node)
     node->AsStoreInd()->SetRMWStatusDefault();
 }
 
-void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
-{
-    GenTree* src = blkNode->Data();
-    TryCreateAddrMode(LIR::Use(BlockRange(), &blkNode->Addr(), blkNode), false);
-}
-
 //------------------------------------------------------------------------
 // LowerArrElem: Lower a GT_ARR_ELEM node
 //
@@ -4303,13 +4478,12 @@ void Lowering::DoPhase()
             m_block = block;
             for (GenTree* node : BlockRange().NonPhiNodes())
             {
-/* We increment the number position of each tree node by 2 to
-* simplify the logic when there's the case of a tree that implicitly
-* does a dual-definition of temps (the long case). In this case
-* is easier to already have an idle spot to handle a dual-def instead
-* of making some messy adjustments if we only increment the
-* number position by one.
-*/
+                // We increment the number position of each tree node by 2 to simplify the logic when there's the case of
+                // a tree that implicitly does a dual-definition of temps (the long case). In this case it is easier to
+                // already have an idle spot to handle a dual-def instead of making some messy adjustments if we only
+                // increment the number position by one.
+                CLANG_FORMAT_COMMENT_ANCHOR;
+
 #ifdef DEBUG
                 node->gtSeqNum = currentLoc;
 #endif
@@ -4633,13 +4807,8 @@ bool Lowering::NodesAreEquivalentLeaves(GenTreePtr tree1, GenTreePtr tree2)
     }
 }
 
-#ifdef _TARGET_64BIT_
 /**
  * Get common information required to handle a cast instruction
- *
- * Right now only supports 64 bit targets. In order to support 32 bit targets the
- * switch statement needs work.
- *
  */
 void Lowering::getCastDescription(GenTreePtr treeNode, CastInfo* castInfo)
 {
@@ -4675,7 +4844,6 @@ void Lowering::getCastDescription(GenTreePtr treeNode, CastInfo* castInfo)
 
     bool signCheckOnly = false; // Do we need to compare the value, or just check masks
 
-
     switch (dstType)
     {
         default:
@@ -4709,9 +4877,13 @@ void Lowering::getCastDescription(GenTreePtr treeNode, CastInfo* castInfo)
             }
             else
             {
+#ifdef _TARGET_64BIT_
                 typeMask = 0xFFFFFFFF80000000LL;
-                typeMin  = INT_MIN;
-                typeMax  = INT_MAX;
+#else
+                typeMask = 0x80000000;
+#endif
+                typeMin = INT_MIN;
+                typeMax = INT_MAX;
             }
             break;
 
@@ -4722,7 +4894,11 @@ void Lowering::getCastDescription(GenTreePtr treeNode, CastInfo* castInfo)
             }
            else
             {
+#ifdef _TARGET_64BIT_
                 typeMask = 0xFFFFFFFF00000000LL;
+#else
+                typeMask = 0x00000000;
+#endif
             }
             break;
 
@@ -4746,8 +4922,6 @@ void Lowering::getCastDescription(GenTreePtr treeNode, CastInfo* castInfo)
     }
 }
 
-#endif // _TARGET_64BIT_
-
 #ifdef DEBUG
 void Lowering::DumpNodeInfoMap()
 {
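For reference, the flowgraph built by the 32-bit path of LowerCompare above (the BB0/BB3/BB4 diagram) corresponds to the following straight-line logic for a signed 64-bit "<": the upper halves decide the result unless they are equal, in which case the unsigned comparison of the lower halves decides. This is a standalone C++ sketch, not JIT code; LessThan64 and its sample values are hypothetical names used only for the illustration, and an arithmetic right shift is assumed for negative values:

#include <cassert>
#include <cstdint>

// Signed 64-bit "x < y" evaluated with 32-bit halves:
//   BB0: upper halves already order the operands the other way (hi(x) > hi(y))  -> false
//   BB3: upper halves order them the right way (hi(x) < hi(y))                  -> true
//   BB4: upper halves are equal, so the *unsigned* compare of the lower halves decides
static bool LessThan64(int64_t x, int64_t y)
{
    const int32_t  xHi = static_cast<int32_t>(x >> 32); // arithmetic shift assumed
    const int32_t  yHi = static_cast<int32_t>(y >> 32);
    const uint32_t xLo = static_cast<uint32_t>(x);
    const uint32_t yLo = static_cast<uint32_t>(y);

    if (xHi > yHi)
    {
        return false;
    }
    if (xHi < yHi)
    {
        return true;
    }
    return xLo < yLo;
}

int main()
{
    const int64_t samples[] = {INT64_MIN, -3, -1, 0, 1, 3, 0x100000000LL, INT64_MAX};
    for (int64_t a : samples)
    {
        for (int64_t b : samples)
        {
            assert(LessThan64(a, b) == (a < b));
        }
    }
    return 0;
}

The same split generalizes to <=, > and >= by changing which relation is tested in BB0, BB3 and BB4, which is what the hiCmpOper/loCmpOper switch in the diff encodes.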