diff options
Diffstat (limited to 'src/jit/lowerxarch.cpp')
-rw-r--r-- | src/jit/lowerxarch.cpp | 3677 |
1 files changed, 10 insertions, 3667 deletions
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index 589cef482e..f89a3dfc7b 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -42,61 +42,11 @@ void Lowering::LowerRotate(GenTreePtr tree) // // Notes: // This involves: -// - Setting the appropriate candidates for a store of a multi-reg call return value. -// - Requesting an internal register for SIMD12 stores. -// - Handling of contained immediates and widening operations of unsigneds. +// - Widening operations of unsigneds. void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc) { - TreeNodeInfo* info = &(storeLoc->gtLsraInfo); - - // Is this the case of var = call where call is returning - // a value in multiple return registers? GenTree* op1 = storeLoc->gtGetOp1(); - if (op1->IsMultiRegCall()) - { - // backend expects to see this case only for store lclvar. - assert(storeLoc->OperGet() == GT_STORE_LCL_VAR); - - // srcCount = number of registers in which the value is returned by call - GenTreeCall* call = op1->AsCall(); - ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); - info->srcCount = retTypeDesc->GetReturnRegCount(); - - // Call node srcCandidates = Bitwise-OR(allregs(GetReturnRegType(i))) for all i=0..RetRegCount-1 - regMaskTP srcCandidates = m_lsra->allMultiRegCallNodeRegs(call); - op1->gtLsraInfo.setSrcCandidates(m_lsra, srcCandidates); - return; - } - -#ifdef FEATURE_SIMD - if (varTypeIsSIMD(storeLoc)) - { - if (op1->IsCnsIntOrI()) - { - // InitBlk - MakeSrcContained(storeLoc, op1); - } - else if ((storeLoc->TypeGet() == TYP_SIMD12) && (storeLoc->OperGet() == GT_STORE_LCL_FLD)) - { - // Need an additional register to extract upper 4 bytes of Vector3. - info->internalFloatCount = 1; - info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs()); - - // In this case don't mark the operand as contained as we want it to - // be evaluated into an xmm register - } - return; - } -#endif // FEATURE_SIMD - - // If the source is a containable immediate, make it contained, unless it is - // an int-size or larger store of zero to memory, because we can generate smaller code - // by zeroing a register and then storing it. - if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(storeLoc))) - { - MakeSrcContained(storeLoc, op1); - } // Try to widen the ops if they are going into a local var. if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (storeLoc->gtOp1->gtOper == GT_CNS_INT)) @@ -148,1490 +98,8 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc) } } -/** - * Takes care of annotating the register requirements - * for every TreeNodeInfo struct that maps to each tree node. - * Preconditions: - * LSRA Has been initialized and there is a TreeNodeInfo node - * already allocated and initialized for every tree in the IR. - * Postconditions: - * Every TreeNodeInfo instance has the right annotations on register - * requirements needed by LSRA to build the Interval Table (source, - * destination and internal [temp] register counts). - * This code is refactored originally from LSRA. - */ -void Lowering::TreeNodeInfoInit(GenTree* tree) -{ - LinearScan* l = m_lsra; - Compiler* compiler = comp; - - TreeNodeInfo* info = &(tree->gtLsraInfo); - - switch (tree->OperGet()) - { - GenTree* op1; - GenTree* op2; - - default: - TreeNodeInfoInitSimple(tree); - break; - - case GT_LCL_FLD: - case GT_LCL_VAR: - info->srcCount = 0; - info->dstCount = 1; - -#ifdef FEATURE_SIMD - // Need an additional register to read upper 4 bytes of Vector3. - if (tree->TypeGet() == TYP_SIMD12) - { - // We need an internal register different from targetReg in which 'tree' produces its result - // because both targetReg and internal reg will be in use at the same time. - info->internalFloatCount = 1; - info->isInternalRegDelayFree = true; - info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs()); - } -#endif - break; - - case GT_STORE_LCL_FLD: - case GT_STORE_LCL_VAR: -#ifdef _TARGET_X86_ - if (tree->gtGetOp1()->OperGet() == GT_LONG) - { - info->srcCount = 2; - } - else -#endif // _TARGET_X86_ - { - info->srcCount = 1; - } - info->dstCount = 0; - LowerStoreLoc(tree->AsLclVarCommon()); - break; - - case GT_BOX: - noway_assert(!"box should not exist here"); - // The result of 'op1' is also the final result - info->srcCount = 0; - info->dstCount = 0; - break; - - case GT_PHYSREGDST: - info->srcCount = 1; - info->dstCount = 0; - break; - - case GT_COMMA: - { - GenTreePtr firstOperand; - GenTreePtr secondOperand; - if (tree->gtFlags & GTF_REVERSE_OPS) - { - firstOperand = tree->gtOp.gtOp2; - secondOperand = tree->gtOp.gtOp1; - } - else - { - firstOperand = tree->gtOp.gtOp1; - secondOperand = tree->gtOp.gtOp2; - } - if (firstOperand->TypeGet() != TYP_VOID) - { - firstOperand->gtLsraInfo.isLocalDefUse = true; - firstOperand->gtLsraInfo.dstCount = 0; - } - if (tree->TypeGet() == TYP_VOID && secondOperand->TypeGet() != TYP_VOID) - { - secondOperand->gtLsraInfo.isLocalDefUse = true; - secondOperand->gtLsraInfo.dstCount = 0; - } - } - info->srcCount = 0; - info->dstCount = 0; - break; - - case GT_LIST: - case GT_FIELD_LIST: - case GT_ARGPLACE: - case GT_NO_OP: - case GT_START_NONGC: - case GT_PROF_HOOK: - info->srcCount = 0; - info->dstCount = 0; - break; - - case GT_CNS_DBL: - info->srcCount = 0; - info->dstCount = 1; - break; - -#if !defined(_TARGET_64BIT_) - - case GT_LONG: - if ((tree->gtLIRFlags & LIR::Flags::IsUnusedValue) != 0) - { - // An unused GT_LONG node needs to consume its sources. - info->srcCount = 2; - } - else - { - // Passthrough - info->srcCount = 0; - } - - info->dstCount = 0; - break; - -#endif // !defined(_TARGET_64BIT_) - - case GT_QMARK: - case GT_COLON: - info->srcCount = 0; - info->dstCount = 0; - unreached(); - break; - - case GT_RETURN: - TreeNodeInfoInitReturn(tree); - break; - - case GT_RETFILT: - if (tree->TypeGet() == TYP_VOID) - { - info->srcCount = 0; - info->dstCount = 0; - } - else - { - assert(tree->TypeGet() == TYP_INT); - - info->srcCount = 1; - info->dstCount = 0; - - info->setSrcCandidates(l, RBM_INTRET); - tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET); - } - break; - - // A GT_NOP is either a passthrough (if it is void, or if it has - // a child), but must be considered to produce a dummy value if it - // has a type but no child - case GT_NOP: - info->srcCount = 0; - if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr) - { - info->dstCount = 1; - } - else - { - info->dstCount = 0; - } - break; - - case GT_JTRUE: - { - info->srcCount = 0; - info->dstCount = 0; - - GenTree* cmp = tree->gtGetOp1(); - l->clearDstCount(cmp); - -#ifdef FEATURE_SIMD - // Say we have the following IR - // simdCompareResult = GT_SIMD((In)Equality, v1, v2) - // integerCompareResult = GT_EQ/NE(simdCompareResult, true/false) - // GT_JTRUE(integerCompareResult) - // - // In this case we don't need to generate code for GT_EQ_/NE, since SIMD (In)Equality - // intrinsic would set or clear Zero flag. - - genTreeOps cmpOper = cmp->OperGet(); - if (cmpOper == GT_EQ || cmpOper == GT_NE) - { - GenTree* cmpOp1 = cmp->gtGetOp1(); - GenTree* cmpOp2 = cmp->gtGetOp2(); - - if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1))) - { - // clear dstCount on SIMD node to indicate that - // result doesn't need to be materialized into a register. - l->clearOperandCounts(cmp); - l->clearDstCount(cmpOp1); - l->clearOperandCounts(cmpOp2); - - // Codegen of SIMD (in)Equality uses target integer reg - // only for setting flags. Target reg is not needed on AVX - // when comparing against Vector Zero. In all other cases - // we need to reserve an int type internal register, since we - // have cleared dstCount. - if (compiler->canUseAVX() && cmpOp1->gtGetOp2()->IsIntegralConstVector(0)) - { - // We don't need an internal register,since we use vptest - // for setting flags. - } - else - { - ++(cmpOp1->gtLsraInfo.internalIntCount); - regMaskTP internalCandidates = cmpOp1->gtLsraInfo.getInternalCandidates(l); - internalCandidates |= l->allRegs(TYP_INT); - cmpOp1->gtLsraInfo.setInternalCandidates(l, internalCandidates); - } - - // We would have to reverse compare oper in the following cases: - // 1) SIMD Equality: Sets Zero flag on equal otherwise clears it. - // Therefore, if compare oper is == or != against false(0), we will - // be checking opposite of what is required. - // - // 2) SIMD inEquality: Clears Zero flag on true otherwise sets it. - // Therefore, if compare oper is == or != against true(1), we will - // be checking opposite of what is required. - GenTreeSIMD* simdNode = cmpOp1->AsSIMD(); - if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) - { - if (cmpOp2->IsIntegralConst(0)) - { - cmp->SetOper(GenTree::ReverseRelop(cmpOper)); - } - } - else - { - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality); - if (cmpOp2->IsIntegralConst(1)) - { - cmp->SetOper(GenTree::ReverseRelop(cmpOper)); - } - } - } - } -#endif // FEATURE_SIMD - } - break; - - case GT_JCC: - info->srcCount = 0; - info->dstCount = 0; - break; - - case GT_JMP: - info->srcCount = 0; - info->dstCount = 0; - break; - - case GT_SWITCH: - // This should never occur since switch nodes must not be visible at this - // point in the JIT. - info->srcCount = 0; - info->dstCount = 0; // To avoid getting uninit errors. - noway_assert(!"Switch must be lowered at this point"); - break; - - case GT_JMPTABLE: - info->srcCount = 0; - info->dstCount = 1; - break; - - case GT_SWITCH_TABLE: - info->srcCount = 2; - info->internalIntCount = 1; - info->dstCount = 0; - break; - - case GT_ASG: - case GT_ASG_ADD: - case GT_ASG_SUB: - noway_assert(!"We should never hit any assignment operator in lowering"); - info->srcCount = 0; - info->dstCount = 0; - break; - -#if !defined(_TARGET_64BIT_) - case GT_ADD_LO: - case GT_ADD_HI: - case GT_SUB_LO: - case GT_SUB_HI: -#endif - case GT_ADD: - case GT_SUB: - // SSE2 arithmetic instructions doesn't support the form "op mem, xmm". - // Rather they only support "op xmm, mem/xmm" form. - if (varTypeIsFloating(tree->TypeGet())) - { - // overflow operations aren't supported on float/double types. - assert(!tree->gtOverflow()); - - op1 = tree->gtGetOp1(); - op2 = tree->gtGetOp2(); - - // No implicit conversions at this stage as the expectation is that - // everything is made explicit by adding casts. - assert(op1->TypeGet() == op2->TypeGet()); - - info->srcCount = 2; - info->dstCount = 1; - - if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl()) - { - MakeSrcContained(tree, op2); - } - else if (tree->OperIsCommutative() && - (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1)))) - { - // Though we have GT_ADD(op1=memOp, op2=non-memOp, we try to reorder the operands - // as long as it is safe so that the following efficient code sequence is generated: - // addss/sd targetReg, memOp (if op1Reg == targetReg) OR - // movaps targetReg, op2Reg; addss/sd targetReg, [memOp] - // - // Instead of - // movss op1Reg, [memOp]; addss/sd targetReg, Op2Reg (if op1Reg == targetReg) OR - // movss op1Reg, [memOp]; movaps targetReg, op1Reg, addss/sd targetReg, Op2Reg - MakeSrcContained(tree, op1); - } - else - { - // If there are no containable operands, we can make an operand reg optional. - SetRegOptionalForBinOp(tree); - } - break; - } - - __fallthrough; - - case GT_AND: - case GT_OR: - case GT_XOR: - TreeNodeInfoInitLogicalOp(tree); - break; - - case GT_RETURNTRAP: - // this just turns into a compare of its child with an int - // + a conditional call - info->srcCount = 1; - info->dstCount = 0; - if (tree->gtOp.gtOp1->isIndir()) - { - MakeSrcContained(tree, tree->gtOp.gtOp1); - } - info->internalIntCount = 1; - info->setInternalCandidates(l, l->allRegs(TYP_INT)); - break; - - case GT_MOD: - case GT_DIV: - case GT_UMOD: - case GT_UDIV: - TreeNodeInfoInitModDiv(tree); - break; - - case GT_MUL: - case GT_MULHI: -#if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND) - case GT_MUL_LONG: -#endif - SetMulOpCounts(tree); - break; - - case GT_INTRINSIC: - TreeNodeInfoInitIntrinsic(tree); - break; - -#ifdef FEATURE_SIMD - case GT_SIMD: - TreeNodeInfoInitSIMD(tree); - break; -#endif // FEATURE_SIMD - - case GT_CAST: - TreeNodeInfoInitCast(tree); - break; - - case GT_NEG: - info->srcCount = 1; - info->dstCount = 1; - - // TODO-XArch-CQ: - // SSE instruction set doesn't have an instruction to negate a number. - // The recommended way is to xor the float/double number with a bitmask. - // The only way to xor is using xorps or xorpd both of which operate on - // 128-bit operands. To hold the bit-mask we would need another xmm - // register or a 16-byte aligned 128-bit data constant. Right now emitter - // lacks the support for emitting such constants or instruction with mem - // addressing mode referring to a 128-bit operand. For now we use an - // internal xmm register to load 32/64-bit bitmask from data section. - // Note that by trading additional data section memory (128-bit) we can - // save on the need for an internal register and also a memory-to-reg - // move. - // - // Note: another option to avoid internal register requirement is by - // lowering as GT_SUB(0, src). This will generate code different from - // Jit64 and could possibly result in compat issues (?). - if (varTypeIsFloating(tree)) - { - info->internalFloatCount = 1; - info->setInternalCandidates(l, l->internalFloatRegCandidates()); - } - else - { - // Codegen of this tree node sets ZF and SF flags. - tree->gtFlags |= GTF_ZSF_SET; - } - break; - - case GT_NOT: - info->srcCount = 1; - info->dstCount = 1; - break; - - case GT_LSH: - case GT_RSH: - case GT_RSZ: - case GT_ROL: - case GT_ROR: -#ifdef _TARGET_X86_ - case GT_LSH_HI: - case GT_RSH_LO: -#endif - TreeNodeInfoInitShiftRotate(tree); - break; - - case GT_EQ: - case GT_NE: - case GT_LT: - case GT_LE: - case GT_GE: - case GT_GT: - TreeNodeInfoInitCmp(tree); - break; - - case GT_CKFINITE: - info->srcCount = 1; - info->dstCount = 1; - info->internalIntCount = 1; - break; - - case GT_CMPXCHG: - info->srcCount = 3; - info->dstCount = 1; - - // comparand is preferenced to RAX. - // Remaining two operands can be in any reg other than RAX. - tree->gtCmpXchg.gtOpComparand->gtLsraInfo.setSrcCandidates(l, RBM_RAX); - tree->gtCmpXchg.gtOpLocation->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX); - tree->gtCmpXchg.gtOpValue->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX); - tree->gtLsraInfo.setDstCandidates(l, RBM_RAX); - break; - - case GT_LOCKADD: - info->srcCount = 2; - info->dstCount = 0; - - CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2); - break; - - case GT_CALL: - TreeNodeInfoInitCall(tree->AsCall()); - break; - - case GT_ADDR: - { - // For a GT_ADDR, the child node should not be evaluated into a register - GenTreePtr child = tree->gtOp.gtOp1; - assert(!l->isCandidateLocalRef(child)); - l->clearDstCount(child); - info->srcCount = 0; - info->dstCount = 1; - } - break; - -#if !defined(FEATURE_PUT_STRUCT_ARG_STK) - case GT_OBJ: -#endif - case GT_BLK: - case GT_DYN_BLK: - // These should all be eliminated prior to Lowering. - assert(!"Non-store block node in Lowering"); - info->srcCount = 0; - info->dstCount = 0; - break; - -#ifdef FEATURE_PUT_STRUCT_ARG_STK - case GT_PUTARG_STK: - TreeNodeInfoInitPutArgStk(tree->AsPutArgStk()); - break; -#endif // FEATURE_PUT_STRUCT_ARG_STK - - case GT_STORE_BLK: - case GT_STORE_OBJ: - case GT_STORE_DYN_BLK: - TreeNodeInfoInitBlockStore(tree->AsBlk()); - break; - - case GT_INIT_VAL: - // Always a passthrough of its child's value. - info->srcCount = 0; - info->dstCount = 0; - break; - - case GT_LCLHEAP: - TreeNodeInfoInitLclHeap(tree); - break; - - case GT_ARR_BOUNDS_CHECK: -#ifdef FEATURE_SIMD - case GT_SIMD_CHK: -#endif // FEATURE_SIMD - { - GenTreeBoundsChk* node = tree->AsBoundsChk(); - // Consumes arrLen & index - has no result - info->srcCount = 2; - info->dstCount = 0; - - GenTreePtr other; - if (CheckImmedAndMakeContained(tree, node->gtIndex)) - { - other = node->gtArrLen; - } - else if (CheckImmedAndMakeContained(tree, node->gtArrLen)) - { - other = node->gtIndex; - } - else if (node->gtIndex->isMemoryOp()) - { - other = node->gtIndex; - } - else - { - other = node->gtArrLen; - } - - if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet()) - { - if (other->isMemoryOp()) - { - MakeSrcContained(tree, other); - } - else - { - // We can mark 'other' as reg optional, since it is not contained. - SetRegOptional(other); - } - } - } - break; - - case GT_ARR_ELEM: - // These must have been lowered to GT_ARR_INDEX - noway_assert(!"We should never see a GT_ARR_ELEM in lowering"); - info->srcCount = 0; - info->dstCount = 0; - break; - - case GT_ARR_INDEX: - info->srcCount = 2; - info->dstCount = 1; - // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple - // times while the result is being computed. - tree->AsArrIndex()->ArrObj()->gtLsraInfo.isDelayFree = true; - info->hasDelayFreeSrc = true; - break; - - case GT_ARR_OFFSET: - // This consumes the offset, if any, the arrObj and the effective index, - // and produces the flattened offset for this dimension. - info->srcCount = 3; - info->dstCount = 1; - - // we don't want to generate code for this - if (tree->gtArrOffs.gtOffset->IsIntegralConst(0)) - { - MakeSrcContained(tree, tree->gtArrOffs.gtOffset); - } - else - { - // Here we simply need an internal register, which must be different - // from any of the operand's registers, but may be the same as targetReg. - info->internalIntCount = 1; - } - break; - - case GT_LEA: - // The LEA usually passes its operands through to the GT_IND, in which case we'll - // clear the info->srcCount and info->dstCount later, but we may be instantiating an address, - // so we set them here. - info->srcCount = 0; - if (tree->AsAddrMode()->HasBase()) - { - info->srcCount++; - } - if (tree->AsAddrMode()->HasIndex()) - { - info->srcCount++; - } - info->dstCount = 1; - break; - - case GT_STOREIND: - { - info->srcCount = 2; - info->dstCount = 0; - GenTree* src = tree->gtOp.gtOp2; - - if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree)) - { - LowerGCWriteBarrier(tree); - break; - } - - // If the source is a containable immediate, make it contained, unless it is - // an int-size or larger store of zero to memory, because we can generate smaller code - // by zeroing a register and then storing it. - if (IsContainableImmed(tree, src) && - (!src->IsIntegralConst(0) || varTypeIsSmall(tree) || tree->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR)) - { - MakeSrcContained(tree, src); - } - else if (!varTypeIsFloating(tree)) - { - // Perform recognition of trees with the following structure: - // StoreInd(addr, BinOp(expr, GT_IND(addr))) - // to be able to fold this into an instruction of the form - // BINOP [addr], register - // where register is the actual place where 'expr' is computed. - // - // SSE2 doesn't support RMW form of instructions. - if (SetStoreIndOpCountsIfRMWMemOp(tree)) - { - break; - } - } - - SetIndirAddrOpCounts(tree); - } - break; - - case GT_NULLCHECK: - info->dstCount = 0; - info->srcCount = 1; - info->isLocalDefUse = true; - break; - - case GT_IND: - info->dstCount = 1; - info->srcCount = 1; - SetIndirAddrOpCounts(tree); - break; - - case GT_CATCH_ARG: - info->srcCount = 0; - info->dstCount = 1; - info->setDstCandidates(l, RBM_EXCEPTION_OBJECT); - break; - -#if !FEATURE_EH_FUNCLETS - case GT_END_LFIN: - info->srcCount = 0; - info->dstCount = 0; - break; -#endif - - case GT_CLS_VAR: - // These nodes are eliminated by rationalizer. - JITDUMP("Unexpected node %s in Lower.\n", GenTree::NodeName(tree->OperGet())); - unreached(); - break; - } // end switch (tree->OperGet()) - - // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1. - // Even then we would like to set isTgtPref on Op1. - if (tree->OperIsBinary() && info->srcCount >= 1) - { - if (isRMWRegOper(tree)) - { - GenTree* op1 = tree->gtOp.gtOp1; - GenTree* op2 = tree->gtOp.gtOp2; - - // Commutative opers like add/mul/and/or/xor could reverse the order of - // operands if it is safe to do so. In such a case we would like op2 to be - // target preferenced instead of op1. - if (tree->OperIsCommutative() && op1->gtLsraInfo.dstCount == 0 && op2 != nullptr) - { - op1 = op2; - op2 = tree->gtOp.gtOp1; - } - - // If we have a read-modify-write operation, we want to preference op1 to the target. - // If op1 is contained, we don't want to preference it, but it won't - // show up as a source in that case, so it will be ignored. - op1->gtLsraInfo.isTgtPref = true; - - // Is this a non-commutative operator, or is op2 a contained memory op? - // (Note that we can't call IsContained() at this point because it uses exactly the - // same information we're currently computing.) - // In either case, we need to make op2 remain live until the op is complete, by marking - // the source(s) associated with op2 as "delayFree". - // Note that if op2 of a binary RMW operator is a memory op, even if the operator - // is commutative, codegen cannot reverse them. - // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's - // more work to be done to correctly reverse the operands if they involve memory - // operands. Also, we may need to handle more cases than GT_IND, especially once - // we've modified the register allocator to not require all nodes to be assigned - // a register (e.g. a spilled lclVar can often be referenced directly from memory). - // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op. - - GenTree* delayUseSrc = nullptr; - // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have - // to special case them. - if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD) - { - delayUseSrc = op1; - } - else if ((op2 != nullptr) && - (!tree->OperIsCommutative() || (op2->isMemoryOp() && (op2->gtLsraInfo.srcCount == 0)))) - { - delayUseSrc = op2; - } - if (delayUseSrc != nullptr) - { - // If delayUseSrc is an indirection and it doesn't produce a result, then we need to set "delayFree' - // on the base & index, if any. - // Otherwise, we set it on delayUseSrc itself. - if (delayUseSrc->isIndir() && (delayUseSrc->gtLsraInfo.dstCount == 0)) - { - GenTree* base = delayUseSrc->AsIndir()->Base(); - GenTree* index = delayUseSrc->AsIndir()->Index(); - if (base != nullptr) - { - base->gtLsraInfo.isDelayFree = true; - } - if (index != nullptr) - { - index->gtLsraInfo.isDelayFree = true; - } - } - else - { - delayUseSrc->gtLsraInfo.isDelayFree = true; - } - info->hasDelayFreeSrc = true; - } - } - } - - TreeNodeInfoInitCheckByteable(tree); - - // We need to be sure that we've set info->srcCount and info->dstCount appropriately - assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT)); -} - -//------------------------------------------------------------------------ -// TreeNodeInfoInitCheckByteable: Check the tree to see if "byte-able" registers are -// required, and set the tree node info accordingly. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// None. -// -void Lowering::TreeNodeInfoInitCheckByteable(GenTree* tree) -{ -#ifdef _TARGET_X86_ - LinearScan* l = m_lsra; - TreeNodeInfo* info = &(tree->gtLsraInfo); - - // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands - // if the tree node is a byte type. - // - // Though this looks conservative in theory, in practice we could not think of a case where - // the below logic leads to conservative register specification. In future when or if we find - // one such case, this logic needs to be fine tuned for that case(s). - - if (ExcludeNonByteableRegisters(tree)) - { - regMaskTP regMask; - if (info->dstCount > 0) - { - regMask = info->getDstCandidates(l); - assert(regMask != RBM_NONE); - info->setDstCandidates(l, regMask & ~RBM_NON_BYTE_REGS); - } - - if (tree->OperIsSimple() && (info->srcCount > 0)) - { - // No need to set src candidates on a contained child operand. - GenTree* op = tree->gtOp.gtOp1; - assert(op != nullptr); - bool containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0); - if (!containedNode) - { - regMask = op->gtLsraInfo.getSrcCandidates(l); - assert(regMask != RBM_NONE); - op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS); - } - - if (tree->OperIsBinary() && (tree->gtOp.gtOp2 != nullptr)) - { - op = tree->gtOp.gtOp2; - containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0); - if (!containedNode) - { - regMask = op->gtLsraInfo.getSrcCandidates(l); - assert(regMask != RBM_NONE); - op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS); - } - } - } - } -#endif //_TARGET_X86_ -} - -//------------------------------------------------------------------------ -// TreeNodeInfoInitSimple: Sets the srcCount and dstCount for all the trees -// without special handling based on the tree node type. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// None. -// -void Lowering::TreeNodeInfoInitSimple(GenTree* tree) -{ - TreeNodeInfo* info = &(tree->gtLsraInfo); - unsigned kind = tree->OperKind(); - info->dstCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1; - if (kind & (GTK_CONST | GTK_LEAF)) - { - info->srcCount = 0; - } - else if (kind & (GTK_SMPOP)) - { - if (tree->gtGetOp2() != nullptr) - { - info->srcCount = 2; - } - else - { - info->srcCount = 1; - } - } - else - { - unreached(); - } -} - -//------------------------------------------------------------------------ -// TreeNodeInfoInitReturn: Set the NodeInfo for a GT_RETURN. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// None. -// -void Lowering::TreeNodeInfoInitReturn(GenTree* tree) -{ - TreeNodeInfo* info = &(tree->gtLsraInfo); - LinearScan* l = m_lsra; - Compiler* compiler = comp; - -#if !defined(_TARGET_64BIT_) - if (tree->TypeGet() == TYP_LONG) - { - GenTree* op1 = tree->gtGetOp1(); - noway_assert(op1->OperGet() == GT_LONG); - GenTree* loVal = op1->gtGetOp1(); - GenTree* hiVal = op1->gtGetOp2(); - info->srcCount = 2; - loVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_LO); - hiVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_HI); - info->dstCount = 0; - } - else -#endif // !defined(_TARGET_64BIT_) - { - GenTree* op1 = tree->gtGetOp1(); - regMaskTP useCandidates = RBM_NONE; - - info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1; - info->dstCount = 0; - -#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING - if (varTypeIsStruct(tree)) - { - // op1 has to be either an lclvar or a multi-reg returning call - if (op1->OperGet() == GT_LCL_VAR) - { - GenTreeLclVarCommon* lclVarCommon = op1->AsLclVarCommon(); - LclVarDsc* varDsc = &(compiler->lvaTable[lclVarCommon->gtLclNum]); - assert(varDsc->lvIsMultiRegRet); - - // Mark var as contained if not enregistrable. - if (!varTypeIsEnregisterableStruct(op1)) - { - MakeSrcContained(tree, op1); - } - } - else - { - noway_assert(op1->IsMultiRegCall()); - - ReturnTypeDesc* retTypeDesc = op1->AsCall()->GetReturnTypeDesc(); - info->srcCount = retTypeDesc->GetReturnRegCount(); - useCandidates = retTypeDesc->GetABIReturnRegs(); - } - } - else -#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING - { - // Non-struct type return - determine useCandidates - switch (tree->TypeGet()) - { - case TYP_VOID: - useCandidates = RBM_NONE; - break; - case TYP_FLOAT: - useCandidates = RBM_FLOATRET; - break; - case TYP_DOUBLE: - useCandidates = RBM_DOUBLERET; - break; -#if defined(_TARGET_64BIT_) - case TYP_LONG: - useCandidates = RBM_LNGRET; - break; -#endif // defined(_TARGET_64BIT_) - default: - useCandidates = RBM_INTRET; - break; - } - } - - if (useCandidates != RBM_NONE) - { - op1->gtLsraInfo.setSrcCandidates(l, useCandidates); - } - } -} - -//------------------------------------------------------------------------ -// TreeNodeInfoInitShiftRotate: Set the NodeInfo for a shift or rotate. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// None. -// -void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree) -{ - TreeNodeInfo* info = &(tree->gtLsraInfo); - LinearScan* l = m_lsra; - - info->srcCount = 2; - info->dstCount = 1; - - // For shift operations, we need that the number - // of bits moved gets stored in CL in case - // the number of bits to shift is not a constant. - GenTreePtr shiftBy = tree->gtOp.gtOp2; - GenTreePtr source = tree->gtOp.gtOp1; - -#ifdef _TARGET_X86_ - // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that - // we can have a three operand form. Increment the srcCount. - if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO) - { - assert(source->OperGet() == GT_LONG); - - info->srcCount++; - - if (tree->OperGet() == GT_LSH_HI) - { - GenTreePtr sourceLo = source->gtOp.gtOp1; - sourceLo->gtLsraInfo.isDelayFree = true; - } - else - { - GenTreePtr sourceHi = source->gtOp.gtOp2; - sourceHi->gtLsraInfo.isDelayFree = true; - } - - source->gtLsraInfo.hasDelayFreeSrc = true; - info->hasDelayFreeSrc = true; - } -#endif - - // x64 can encode 8 bits of shift and it will use 5 or 6. (the others are masked off) - // We will allow whatever can be encoded - hope you know what you are doing. - if (!IsContainableImmed(tree, shiftBy) || (shiftBy->gtIntConCommon.IconValue() > 255) || - (shiftBy->gtIntConCommon.IconValue() < 0)) - { - source->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX); - shiftBy->gtLsraInfo.setSrcCandidates(l, RBM_RCX); - info->setDstCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX); - } - else - { - MakeSrcContained(tree, shiftBy); - - // Note that Rotate Left/Right instructions don't set ZF and SF flags. - // - // If the operand being shifted is 32-bits then upper three bits are masked - // by hardware to get actual shift count. Similarly for 64-bit operands - // shift count is narrowed to [0..63]. If the resulting shift count is zero, - // then shift operation won't modify flags. - // - // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0) - // if the shift count is known to be non-zero and in the range depending on the - // operand size. - } -} - -//------------------------------------------------------------------------ -// TreeNodeInfoInitCall: Set the NodeInfo for a call. -// -// Arguments: -// call - The call node of interest -// -// Return Value: -// None. -// -void Lowering::TreeNodeInfoInitCall(GenTreeCall* call) -{ - TreeNodeInfo* info = &(call->gtLsraInfo); - LinearScan* l = m_lsra; - Compiler* compiler = comp; - bool hasMultiRegRetVal = false; - ReturnTypeDesc* retTypeDesc = nullptr; - - info->srcCount = 0; - if (call->TypeGet() != TYP_VOID) - { - hasMultiRegRetVal = call->HasMultiRegRetVal(); - if (hasMultiRegRetVal) - { - // dst count = number of registers in which the value is returned by call - retTypeDesc = call->GetReturnTypeDesc(); - info->dstCount = retTypeDesc->GetReturnRegCount(); - } - else - { - info->dstCount = 1; - } - } - else - { - info->dstCount = 0; - } - - GenTree* ctrlExpr = call->gtControlExpr; - if (call->gtCallType == CT_INDIRECT) - { - // either gtControlExpr != null or gtCallAddr != null. - // Both cannot be non-null at the same time. - assert(ctrlExpr == nullptr); - assert(call->gtCallAddr != nullptr); - ctrlExpr = call->gtCallAddr; - -#ifdef _TARGET_X86_ - // Fast tail calls aren't currently supported on x86, but if they ever are, the code - // below that handles indirect VSD calls will need to be fixed. - assert(!call->IsFastTailCall() || !call->IsVirtualStub()); -#endif // _TARGET_X86_ - } - - // set reg requirements on call target represented as control sequence. - if (ctrlExpr != nullptr) - { - // we should never see a gtControlExpr whose type is void. - assert(ctrlExpr->TypeGet() != TYP_VOID); - - // call can take a Rm op on x64 - info->srcCount++; - - // In case of fast tail implemented as jmp, make sure that gtControlExpr is - // computed into a register. - if (!call->IsFastTailCall()) - { -#ifdef _TARGET_X86_ - // On x86, we need to generate a very specific pattern for indirect VSD calls: - // - // 3-byte nop - // call dword ptr [eax] - // - // Where EAX is also used as an argument to the stub dispatch helper. Make - // sure that the call target address is computed into EAX in this case. - if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT)) - { - assert(ctrlExpr->isIndir()); - - ctrlExpr->gtGetOp1()->gtLsraInfo.setSrcCandidates(l, RBM_VIRTUAL_STUB_TARGET); - MakeSrcContained(call, ctrlExpr); - } - else -#endif // _TARGET_X86_ - if (ctrlExpr->isIndir()) - { - MakeSrcContained(call, ctrlExpr); - } - } - else - { - // Fast tail call - make sure that call target is always computed in RAX - // so that epilog sequence can generate "jmp rax" to achieve fast tail call. - ctrlExpr->gtLsraInfo.setSrcCandidates(l, RBM_RAX); - } - } - - // If this is a varargs call, we will clear the internal candidates in case we need - // to reserve some integer registers for copying float args. - // We have to do this because otherwise the default candidates are allRegs, and adding - // the individual specific registers will have no effect. - if (call->IsVarargs()) - { - info->setInternalCandidates(l, RBM_NONE); - } - - RegisterType registerType = call->TypeGet(); - - // Set destination candidates for return value of the call. - CLANG_FORMAT_COMMENT_ANCHOR; - -#ifdef _TARGET_X86_ - if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME)) - { - // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with - // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the - // correct argument registers. - info->setDstCandidates(l, RBM_PINVOKE_TCB); - } - else -#endif // _TARGET_X86_ - if (hasMultiRegRetVal) - { - assert(retTypeDesc != nullptr); - info->setDstCandidates(l, retTypeDesc->GetABIReturnRegs()); - } - else if (varTypeIsFloating(registerType)) - { -#ifdef _TARGET_X86_ - // The return value will be on the X87 stack, and we will need to move it. - info->setDstCandidates(l, l->allRegs(registerType)); -#else // !_TARGET_X86_ - info->setDstCandidates(l, RBM_FLOATRET); -#endif // !_TARGET_X86_ - } - else if (registerType == TYP_LONG) - { - info->setDstCandidates(l, RBM_LNGRET); - } - else - { - info->setDstCandidates(l, RBM_INTRET); - } - - // number of args to a call = - // callRegArgs + (callargs - placeholders, setup, etc) - // there is an explicit thisPtr but it is redundant - - // If there is an explicit this pointer, we don't want that node to produce anything - // as it is redundant - if (call->gtCallObjp != nullptr) - { - GenTreePtr thisPtrNode = call->gtCallObjp; - - if (thisPtrNode->gtOper == GT_PUTARG_REG) - { - l->clearOperandCounts(thisPtrNode); - l->clearDstCount(thisPtrNode->gtOp.gtOp1); - } - else - { - l->clearDstCount(thisPtrNode); - } - } - -#if FEATURE_VARARG - bool callHasFloatRegArgs = false; -#endif // !FEATURE_VARARG - - // First, count reg args - for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext()) - { - assert(list->OperIsList()); - - GenTreePtr argNode = list->Current(); - - fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode); - assert(curArgTabEntry); - - if (curArgTabEntry->regNum == REG_STK) - { - // late arg that is not passed in a register - DISPNODE(argNode); - assert(argNode->gtOper == GT_PUTARG_STK); - argNode->gtLsraInfo.srcCount = 1; - argNode->gtLsraInfo.dstCount = 0; - -#ifdef FEATURE_PUT_STRUCT_ARG_STK - // If the node is TYP_STRUCT and it is put on stack with - // putarg_stk operation, we consume and produce no registers. - // In this case the embedded Obj node should not produce - // registers too since it is contained. - // Note that if it is a SIMD type the argument will be in a register. - if (argNode->TypeGet() == TYP_STRUCT) - { - assert(argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_OBJ); - argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0; - argNode->gtLsraInfo.srcCount = 0; - } -#endif // FEATURE_PUT_STRUCT_ARG_STK - continue; - } - - regNumber argReg = REG_NA; - regMaskTP argMask = RBM_NONE; - short regCount = 0; - bool isOnStack = true; - if (curArgTabEntry->regNum != REG_STK) - { - isOnStack = false; - var_types argType = argNode->TypeGet(); - -#if FEATURE_VARARG - callHasFloatRegArgs |= varTypeIsFloating(argType); -#endif // !FEATURE_VARARG - - argReg = curArgTabEntry->regNum; - regCount = 1; - - // Default case is that we consume one source; modify this later (e.g. for - // promoted structs) - info->srcCount++; - - argMask = genRegMask(argReg); - argNode = argNode->gtEffectiveVal(); - } - - // If the struct arg is wrapped in CPYBLK the type of the param will be TYP_VOID. - // Use the curArgTabEntry's isStruct to get whether the param is a struct. - if (varTypeIsStruct(argNode) PUT_STRUCT_ARG_STK_ONLY(|| curArgTabEntry->isStruct)) - { - unsigned originalSize = 0; - LclVarDsc* varDsc = nullptr; - if (argNode->gtOper == GT_LCL_VAR) - { - varDsc = compiler->lvaTable + argNode->gtLclVarCommon.gtLclNum; - originalSize = varDsc->lvSize(); - } - else if (argNode->gtOper == GT_MKREFANY) - { - originalSize = 2 * TARGET_POINTER_SIZE; - } - else if (argNode->gtOper == GT_OBJ) - { - noway_assert(!"GT_OBJ not supported for amd64"); - } -#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING - else if (argNode->gtOper == GT_PUTARG_REG) - { - originalSize = genTypeSize(argNode->gtType); - } - else if (argNode->gtOper == GT_FIELD_LIST) - { - originalSize = 0; - - // There could be up to 2 PUTARG_REGs in the list - GenTreeFieldList* fieldListPtr = argNode->AsFieldList(); - unsigned iterationNum = 0; - for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest()) - { - GenTreePtr putArgRegNode = fieldListPtr->Current(); - assert(putArgRegNode->gtOper == GT_PUTARG_REG); - - if (iterationNum == 0) - { - varDsc = compiler->lvaTable + putArgRegNode->gtOp.gtOp1->gtLclVarCommon.gtLclNum; - originalSize = varDsc->lvSize(); - assert(originalSize != 0); - } - else - { - // Need an extra source for every node, but the first in the list. - info->srcCount++; - - // Get the mask for the second putarg_reg - argMask = genRegMask(curArgTabEntry->otherRegNum); - } - - putArgRegNode->gtLsraInfo.setDstCandidates(l, argMask); - putArgRegNode->gtLsraInfo.setSrcCandidates(l, argMask); - - // To avoid redundant moves, have the argument child tree computed in the - // register in which the argument is passed to the call. - putArgRegNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(putArgRegNode)); - iterationNum++; - } - - assert(iterationNum <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS); - } -#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING - else - { - noway_assert(!"Can't predict unsupported TYP_STRUCT arg kind"); - } - - unsigned slots = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES; - unsigned remainingSlots = slots; - - if (!isOnStack) - { - remainingSlots = slots - 1; - - regNumber reg = (regNumber)(argReg + 1); - while (remainingSlots > 0 && reg <= REG_ARG_LAST) - { - argMask |= genRegMask(reg); - reg = (regNumber)(reg + 1); - remainingSlots--; - regCount++; - } - } - - short internalIntCount = 0; - if (remainingSlots > 0) - { -#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING - // This TYP_STRUCT argument is also passed in the outgoing argument area - // We need a register to address the TYP_STRUCT - internalIntCount = 1; -#else // FEATURE_UNIX_AMD64_STRUCT_PASSING - // And we may need 2 - internalIntCount = 2; -#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING - } - argNode->gtLsraInfo.internalIntCount = internalIntCount; - -#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING - if (argNode->gtOper == GT_PUTARG_REG) - { - argNode->gtLsraInfo.setDstCandidates(l, argMask); - argNode->gtLsraInfo.setSrcCandidates(l, argMask); - } -#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING - } - else - { - argNode->gtLsraInfo.setDstCandidates(l, argMask); - argNode->gtLsraInfo.setSrcCandidates(l, argMask); - } - - // To avoid redundant moves, have the argument child tree computed in the - // register in which the argument is passed to the call. - if (argNode->gtOper == GT_PUTARG_REG) - { - argNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(argNode)); - } - -#if FEATURE_VARARG - // In the case of a varargs call, the ABI dictates that if we have floating point args, - // we must pass the enregistered arguments in both the integer and floating point registers. - // Since the integer register is not associated with this arg node, we will reserve it as - // an internal register so that it is not used during the evaluation of the call node - // (e.g. for the target). - if (call->IsVarargs() && varTypeIsFloating(argNode)) - { - regNumber targetReg = compiler->getCallArgIntRegister(argReg); - info->setInternalIntCount(info->internalIntCount + 1); - info->addInternalCandidates(l, genRegMask(targetReg)); - } -#endif // FEATURE_VARARG - } - - // Now, count stack args - // Note that these need to be computed into a register, but then - // they're just stored to the stack - so the reg doesn't - // need to remain live until the call. In fact, it must not - // because the code generator doesn't actually consider it live, - // so it can't be spilled. - - GenTreePtr args = call->gtCallArgs; - while (args) - { - GenTreePtr arg = args->gtOp.gtOp1; - if (!(args->gtFlags & GTF_LATE_ARG)) - { - TreeNodeInfo* argInfo = &(arg->gtLsraInfo); -#if !defined(_TARGET_64BIT_) - if (arg->TypeGet() == TYP_LONG) - { - assert(arg->OperGet() == GT_LONG); - GenTreePtr loArg = arg->gtGetOp1(); - GenTreePtr hiArg = arg->gtGetOp2(); - assert((loArg->OperGet() == GT_PUTARG_STK) && (hiArg->OperGet() == GT_PUTARG_STK)); - assert((loArg->gtLsraInfo.dstCount == 1) && (hiArg->gtLsraInfo.dstCount == 1)); - loArg->gtLsraInfo.isLocalDefUse = true; - hiArg->gtLsraInfo.isLocalDefUse = true; - } - else -#endif // !defined(_TARGET_64BIT_) - { - if (argInfo->dstCount != 0) - { - argInfo->isLocalDefUse = true; - } - - // If the child of GT_PUTARG_STK is a constant, we don't need a register to - // move it to memory (stack location). - // - // On AMD64, we don't want to make 0 contained, because we can generate smaller code - // by zeroing a register and then storing it. E.g.: - // xor rdx, rdx - // mov gword ptr [rsp+28H], rdx - // is 2 bytes smaller than: - // mov gword ptr [rsp+28H], 0 - // - // On x86, we push stack arguments; we don't use 'mov'. So: - // push 0 - // is 1 byte smaller than: - // xor rdx, rdx - // push rdx - - argInfo->dstCount = 0; - if (arg->gtOper == GT_PUTARG_STK) - { - GenTree* op1 = arg->gtOp.gtOp1; - if (IsContainableImmed(arg, op1) -#if defined(_TARGET_AMD64_) - && !op1->IsIntegralConst(0) -#endif // _TARGET_AMD64_ - ) - { - MakeSrcContained(arg, op1); - } - } - } - } - args = args->gtOp.gtOp2; - } - -#if FEATURE_VARARG - // If it is a fast tail call, it is already preferenced to use RAX. - // Therefore, no need set src candidates on call tgt again. - if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExpr != nullptr)) - { - // Don't assign the call target to any of the argument registers because - // we will use them to also pass floating point arguments as required - // by Amd64 ABI. - ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS)); - } -#endif // !FEATURE_VARARG -} - //------------------------------------------------------------------------ -// TreeNodeInfoInitBlockStore: Set the NodeInfo for a block store. +// LowerBlockStore: Set block store type // // Arguments: // blkNode - The block store node of interest @@ -1639,25 +107,15 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call) // Return Value: // None. // -void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) +void Lowering::LowerBlockStore(GenTreeBlk* blkNode) { - GenTree* dstAddr = blkNode->Addr(); - unsigned size = blkNode->gtBlkSize; - GenTree* source = blkNode->Data(); - LinearScan* l = m_lsra; - Compiler* compiler = comp; - - // Sources are dest address, initVal or source. - // We may require an additional source or temp register for the size. - blkNode->gtLsraInfo.srcCount = 2; - blkNode->gtLsraInfo.dstCount = 0; - blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE); + GenTree* dstAddr = blkNode->Addr(); + unsigned size = blkNode->gtBlkSize; + GenTree* source = blkNode->Data(); + Compiler* compiler = comp; GenTreePtr srcAddrOrFill = nullptr; bool isInitBlk = blkNode->OperIsInitBlkOp(); - regMaskTP dstAddrRegMask = RBM_NONE; - regMaskTP sourceRegMask = RBM_NONE; - regMaskTP blkSizeRegMask = RBM_NONE; if (!isInitBlk) { // CopyObj or CopyBlk @@ -1668,20 +126,6 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) if (source->gtOper == GT_IND) { srcAddrOrFill = blkNode->Data()->gtGetOp1(); - // We're effectively setting source as contained, but can't call MakeSrcContained, because the - // "inheritance" of the srcCount is to a child not a parent - it would "just work" but could be misleading. - // If srcAddr is already non-contained, we don't need to change it. - if (srcAddrOrFill->gtLsraInfo.getDstCount() == 0) - { - srcAddrOrFill->gtLsraInfo.setDstCount(1); - srcAddrOrFill->gtLsraInfo.setSrcCount(source->gtLsraInfo.srcCount); - } - m_lsra->clearOperandCounts(source); - } - else if (!source->IsMultiRegCall() && !source->OperIsSIMD()) - { - assert(source->IsLocal()); - MakeSrcContained(blkNode, source); } } @@ -1735,58 +179,18 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) initVal->gtIntCon.gtIconVal = 0x01010101 * fill; #endif // !_TARGET_AMD64_ - // In case we have a buffer >= 16 bytes - // we can use SSE2 to do a 128-bit store in a single - // instruction. - if (size >= XMM_REGSIZE_BYTES) - { - // Reserve an XMM register to fill it with - // a pack of 16 init value constants. - blkNode->gtLsraInfo.internalFloatCount = 1; - blkNode->gtLsraInfo.setInternalCandidates(l, l->internalFloatRegCandidates()); - if ((fill == 0) && ((size & 0xf) == 0)) - { - MakeSrcContained(blkNode, source); - } - } blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll; - -#ifdef _TARGET_X86_ - if ((size & 1) != 0) - { - // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing - // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this - // when unrolling, so only allow byteable registers as the source value. (We could - // consider just using BlkOpKindRepInstr instead.) - sourceRegMask = RBM_BYTE_REGS; - } -#endif // _TARGET_X86_ } else { - // rep stos has the following register requirements: - // a) The memory address to be in RDI. - // b) The fill value has to be in RAX. - // c) The buffer size will go in RCX. - dstAddrRegMask = RBM_RDI; - srcAddrOrFill = initVal; - sourceRegMask = RBM_RAX; - blkSizeRegMask = RBM_RCX; blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; } } else { #ifdef _TARGET_AMD64_ - // The helper follows the regular AMD64 ABI. - dstAddrRegMask = RBM_ARG_0; - sourceRegMask = RBM_ARG_1; - blkSizeRegMask = RBM_ARG_2; blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; #else // !_TARGET_AMD64_ - dstAddrRegMask = RBM_RDI; - sourceRegMask = RBM_RAX; - blkSizeRegMask = RBM_RCX; blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; #endif // !_TARGET_AMD64_ } @@ -1870,19 +274,12 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) if (IsRepMovsProfitable) { // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq. - blkSizeRegMask = RBM_RCX; blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; } else { blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll; } - - dstAddrRegMask = RBM_RDI; - - // The srcAddr must be in a register. If it was under a GT_IND, we need to subsume all of its - // sources. - sourceRegMask = RBM_RSI; } else { @@ -1903,119 +300,31 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) // our framework assemblies, so this is the main code generation scheme we'll use. if (size <= CPBLK_UNROLL_LIMIT) { - // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg. - // - // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte. - // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude - // RBM_NON_BYTE_REGS from internal candidates. - if ((size & (XMM_REGSIZE_BYTES - 1)) != 0) - { - blkNode->gtLsraInfo.internalIntCount++; - regMaskTP regMask = l->allRegs(TYP_INT); - -#ifdef _TARGET_X86_ - if ((size % 2) != 0) - { - regMask &= ~RBM_NON_BYTE_REGS; - } -#endif - blkNode->gtLsraInfo.setInternalCandidates(l, regMask); - } - - if (size >= XMM_REGSIZE_BYTES) - { - // If we have a buffer larger than XMM_REGSIZE_BYTES, - // reserve an XMM register to use it for a - // series of 16-byte loads and stores. - blkNode->gtLsraInfo.internalFloatCount = 1; - blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates()); - } - - // If src or dst are on stack, we don't have to generate the address into a register - // because it's just some constant+SP - if (srcAddrOrFill != nullptr && srcAddrOrFill->OperIsLocalAddr()) - { - MakeSrcContained(blkNode, srcAddrOrFill); - } - - if (dstAddr->OperIsLocalAddr()) - { - MakeSrcContained(blkNode, dstAddr); - } - blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll; } else { - blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE); - dstAddrRegMask = RBM_RDI; - sourceRegMask = RBM_RSI; - blkSizeRegMask = RBM_RCX; blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; } } #ifdef _TARGET_AMD64_ else { - // In case we have a constant integer this means we went beyond - // CPBLK_MOVS_LIMIT bytes of size, still we should never have the case of - // any GC-Pointers in the src struct. - blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE); - dstAddrRegMask = RBM_ARG_0; - sourceRegMask = RBM_ARG_1; - blkSizeRegMask = RBM_ARG_2; blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper; } #elif defined(_TARGET_X86_) else { - dstAddrRegMask = RBM_RDI; - sourceRegMask = RBM_RSI; - blkSizeRegMask = RBM_RCX; blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr; } #endif // _TARGET_X86_ assert(blkNode->gtBlkOpKind != GenTreeBlk::BlkOpKindInvalid); } - if (dstAddrRegMask != RBM_NONE) - { - dstAddr->gtLsraInfo.setSrcCandidates(l, dstAddrRegMask); - } - if (sourceRegMask != RBM_NONE) - { - if (srcAddrOrFill != nullptr) - { - srcAddrOrFill->gtLsraInfo.setSrcCandidates(l, sourceRegMask); - } - else - { - // This is a local source; we'll use a temp register for its address. - blkNode->gtLsraInfo.addInternalCandidates(l, sourceRegMask); - blkNode->gtLsraInfo.internalIntCount++; - } - } - if (blkSizeRegMask != RBM_NONE) - { - if (size != 0) - { - // Reserve a temp register for the block size argument. - blkNode->gtLsraInfo.addInternalCandidates(l, blkSizeRegMask); - blkNode->gtLsraInfo.internalIntCount++; - } - else - { - // The block size argument is a third argument to GT_STORE_DYN_BLK - noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK); - blkNode->gtLsraInfo.setSrcCount(3); - GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize; - blockSize->gtLsraInfo.setSrcCandidates(l, blkSizeRegMask); - } - } } #ifdef FEATURE_PUT_STRUCT_ARG_STK //------------------------------------------------------------------------ -// TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK. +// LowerPutArgStk: Lower a GT_PUTARG_STK. // // Arguments: // tree - The node of interest @@ -2023,11 +332,8 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) // Return Value: // None. // -void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk) +void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk) { - TreeNodeInfo* info = &(putArgStk->gtLsraInfo); - LinearScan* l = m_lsra; - #ifdef _TARGET_X86_ if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST) { @@ -2070,9 +376,6 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk) fieldCount++; } - info->srcCount = fieldCount; - info->dstCount = 0; - // In theory, the upper bound for the size of a field list is 8: these constructs only appear when passing the // collection of lclVars that represent the fields of a promoted struct lclVar, and we do not promote struct // lclVars with more than 4 fields. If each of these lclVars is of type long, decomposition will split the @@ -2103,9 +406,8 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk) putArgStk->gtOp1 = fieldList; } - // Now that the fields have been sorted, initialize the LSRA info. + // Now that the fields have been sorted, the kind of code we will generate. bool allFieldsAreSlots = true; - bool needsByteTemp = false; unsigned prevOffset = putArgStk->getArgSize(); for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest()) { @@ -2114,56 +416,12 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk) const unsigned fieldOffset = current->gtFieldOffset; assert(fieldType != TYP_LONG); - // For x86 we must mark all integral fields as contained or reg-optional, and handle them - // accordingly in code generation, since we may have up to 8 fields, which cannot all be in - // registers to be consumed atomically by the call. - if (varTypeIsIntegralOrI(fieldNode)) - { - if (fieldNode->OperGet() == GT_LCL_VAR) - { - LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]); - if (varDsc->lvTracked && !varDsc->lvDoNotEnregister) - { - SetRegOptional(fieldNode); - } - else - { - MakeSrcContained(putArgStk, fieldNode); - } - } - else if (fieldNode->IsIntCnsFitsInI32()) - { - MakeSrcContained(putArgStk, fieldNode); - } - else - { - // For the case where we cannot directly push the value, if we run out of registers, - // it would be better to defer computation until we are pushing the arguments rather - // than spilling, but this situation is not all that common, as most cases of promoted - // structs do not have a large number of fields, and of those most are lclVars or - // copy-propagated constants. - SetRegOptional(fieldNode); - } - } - else - { - assert(varTypeIsFloating(fieldNode)); - } - // We can treat as a slot any field that is stored at a slot boundary, where the previous // field is not in the same slot. (Note that we store the fields in reverse order.) const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4); if (!fieldIsSlot) { allFieldsAreSlots = false; - if (varTypeIsByte(fieldType)) - { - // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes - // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will - // need a byte-addressable register for the store. We will enforce this requirement on an internal - // register, which we can use to copy multiple byte values. - needsByteTemp = true; - } } if (varTypeIsGC(fieldType)) @@ -2187,35 +445,13 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk) else { putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; - // If any of the fields cannot be stored with an actual push, we may need a temporary - // register to load the value before storing it to the stack location. - info->internalIntCount = 1; - regMaskTP regMask = l->allRegs(TYP_INT); - if (needsByteTemp) - { - regMask &= ~RBM_NON_BYTE_REGS; - } - info->setInternalCandidates(l, regMask); } return; } #endif // _TARGET_X86_ -#if defined(FEATURE_SIMD) && defined(_TARGET_X86_) - // For PutArgStk of a TYP_SIMD12, we need an extra register. - if (putArgStk->TypeGet() == TYP_SIMD12) - { - info->srcCount = putArgStk->gtOp1->gtLsraInfo.dstCount; - info->dstCount = 0; - info->internalFloatCount = 1; - info->setInternalCandidates(l, l->allSIMDRegs()); - return; - } -#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_) - if (putArgStk->TypeGet() != TYP_STRUCT) { - TreeNodeInfoInitSimple(putArgStk); return; } @@ -2223,21 +459,6 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk) GenTreePtr src = putArgStk->gtOp1; GenTreePtr srcAddr = nullptr; - bool haveLocalAddr = false; - if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND)) - { - srcAddr = src->gtOp.gtOp1; - assert(srcAddr != nullptr); - haveLocalAddr = srcAddr->OperIsLocalAddr(); - } - else - { - assert(varTypeIsSIMD(putArgStk)); - } - - info->srcCount = src->gtLsraInfo.dstCount; - info->dstCount = 0; - // In case of a CpBlk we could use a helper call. In case of putarg_stk we // can't do that since the helper call could kill some already set up outgoing args. // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj. @@ -2257,38 +478,6 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk) // our framework assemblies, so this is the main code generation scheme we'll use. if (size <= CPBLK_UNROLL_LIMIT && putArgStk->gtNumberReferenceSlots == 0) { - // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg. - // - // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte. - // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude - // RBM_NON_BYTE_REGS from internal candidates. - if ((size & (XMM_REGSIZE_BYTES - 1)) != 0) - { - info->internalIntCount++; - regMaskTP regMask = l->allRegs(TYP_INT); - -#ifdef _TARGET_X86_ - if ((size % 2) != 0) - { - regMask &= ~RBM_NON_BYTE_REGS; - } -#endif - info->setInternalCandidates(l, regMask); - } - -#ifdef _TARGET_X86_ - if (size >= 8) -#else // !_TARGET_X86_ - if (size >= XMM_REGSIZE_BYTES) -#endif // !_TARGET_X86_ - { - // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux, - // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a - // series of 16-byte loads and stores. - info->internalFloatCount = 1; - info->addInternalCandidates(l, l->internalFloatRegCandidates()); - } - #ifdef _TARGET_X86_ if (size < XMM_REGSIZE_BYTES) { @@ -2310,1486 +499,11 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk) #endif // _TARGET_X86_ else { - info->internalIntCount += 3; - info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI)); - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr; } - - // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree. - MakeSrcContained(putArgStk, src); - - if (haveLocalAddr) - { - // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary - // copies. - // - // To avoid an assertion in MakeSrcContained, increment the parent's source count beforehand and decrement it - // afterwards. - info->srcCount++; - MakeSrcContained(putArgStk, srcAddr); - info->srcCount--; - } } #endif // FEATURE_PUT_STRUCT_ARG_STK -//------------------------------------------------------------------------ -// TreeNodeInfoInitLclHeap: Set the NodeInfo for a GT_LCLHEAP. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// None. -// -void Lowering::TreeNodeInfoInitLclHeap(GenTree* tree) -{ - TreeNodeInfo* info = &(tree->gtLsraInfo); - LinearScan* l = m_lsra; - Compiler* compiler = comp; - - info->srcCount = 1; - info->dstCount = 1; - - // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp): - // Here '-' means don't care. - // - // Size? Init Memory? # temp regs - // 0 - 0 (returns 0) - // const and <=6 reg words - 0 (pushes '0') - // const and >6 reg words Yes 0 (pushes '0') - // const and <PageSize No 0 (amd64) 1 (x86) - // (x86:tmpReg for sutracting from esp) - // const and >=PageSize No 2 (regCnt and tmpReg for subtracing from sp) - // Non-const Yes 0 (regCnt=targetReg and pushes '0') - // Non-const No 2 (regCnt and tmpReg for subtracting from sp) - // - // Note: Here we don't need internal register to be different from targetReg. - // Rather, require it to be different from operand's reg. - - GenTreePtr size = tree->gtOp.gtOp1; - if (size->IsCnsIntOrI()) - { - MakeSrcContained(tree, size); - - size_t sizeVal = size->gtIntCon.gtIconVal; - - if (sizeVal == 0) - { - info->internalIntCount = 0; - } - else - { - // Compute the amount of memory to properly STACK_ALIGN. - // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size. - // This should also help in debugging as we can examine the original size specified with localloc. - sizeVal = AlignUp(sizeVal, STACK_ALIGN); - - // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc) - // we will generate 'push 0'. - assert((sizeVal % REGSIZE_BYTES) == 0); - size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES; - if (cntRegSizedWords <= 6) - { - info->internalIntCount = 0; - } - else if (!compiler->info.compInitMem) - { - // No need to initialize allocated stack space. - if (sizeVal < compiler->eeGetPageSize()) - { -#ifdef _TARGET_X86_ - info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP. -#else // !_TARGET_X86_ - info->internalIntCount = 0; -#endif // !_TARGET_X86_ - } - else - { - // We need two registers: regCnt and RegTmp - info->internalIntCount = 2; - } - } - else - { - // >6 and need to zero initialize allocated stack space. - info->internalIntCount = 0; - } - } - } - else - { - if (!compiler->info.compInitMem) - { - info->internalIntCount = 2; - } - else - { - info->internalIntCount = 0; - } - } -} - -//------------------------------------------------------------------------ -// TreeNodeInfoInitLogicalOp: Set the NodeInfo for GT_AND/GT_OR/GT_XOR, -// as well as GT_ADD/GT_SUB. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// None. -// -void Lowering::TreeNodeInfoInitLogicalOp(GenTree* tree) -{ - TreeNodeInfo* info = &(tree->gtLsraInfo); - LinearScan* l = m_lsra; - - // We're not marking a constant hanging on the left of the add - // as containable so we assign it to a register having CQ impact. - // TODO-XArch-CQ: Detect this case and support both generating a single instruction - // for GT_ADD(Constant, SomeTree) - info->srcCount = 2; - info->dstCount = 1; - - GenTree* op1 = tree->gtGetOp1(); - GenTree* op2 = tree->gtGetOp2(); - - // We can directly encode the second operand if it is either a containable constant or a memory-op. - // In case of memory-op, we can encode it directly provided its type matches with 'tree' type. - // This is because during codegen, type of 'tree' is used to determine emit Type size. If the types - // do not match, they get normalized (i.e. sign/zero extended) on load into a register. - bool directlyEncodable = false; - bool binOpInRMW = false; - GenTreePtr operand = nullptr; - - if (IsContainableImmed(tree, op2)) - { - directlyEncodable = true; - operand = op2; - } - else - { - binOpInRMW = IsBinOpInRMWStoreInd(tree); - if (!binOpInRMW) - { - if (op2->isMemoryOp() && tree->TypeGet() == op2->TypeGet()) - { - directlyEncodable = true; - operand = op2; - } - else if (tree->OperIsCommutative()) - { - if (IsContainableImmed(tree, op1) || - (op1->isMemoryOp() && tree->TypeGet() == op1->TypeGet() && IsSafeToContainMem(tree, op1))) - { - // If it is safe, we can reverse the order of operands of commutative operations for efficient - // codegen - directlyEncodable = true; - operand = op1; - } - } - } - } - - if (directlyEncodable) - { - assert(operand != nullptr); - MakeSrcContained(tree, operand); - } - else if (!binOpInRMW) - { - // If this binary op neither has contained operands, nor is a - // Read-Modify-Write (RMW) operation, we can mark its operands - // as reg optional. - SetRegOptionalForBinOp(tree); - } - - // Codegen of this tree node sets ZF and SF flags. - tree->gtFlags |= GTF_ZSF_SET; -} - -//------------------------------------------------------------------------ -// TreeNodeInfoInitModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// None. -// -void Lowering::TreeNodeInfoInitModDiv(GenTree* tree) -{ - TreeNodeInfo* info = &(tree->gtLsraInfo); - LinearScan* l = m_lsra; - - GenTree* op1 = tree->gtGetOp1(); - GenTree* op2 = tree->gtGetOp2(); - - info->srcCount = 2; - info->dstCount = 1; - - switch (tree->OperGet()) - { - case GT_MOD: - case GT_DIV: - if (varTypeIsFloating(tree->TypeGet())) - { - // No implicit conversions at this stage as the expectation is that - // everything is made explicit by adding casts. - assert(op1->TypeGet() == op2->TypeGet()); - - if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl()) - { - MakeSrcContained(tree, op2); - } - else - { - // If there are no containable operands, we can make an operand reg optional. - // SSE2 allows only op2 to be a memory-op. - SetRegOptional(op2); - } - - return; - } - break; - - default: - break; - } - - // Amd64 Div/Idiv instruction: - // Dividend in RAX:RDX and computes - // Quotient in RAX, Remainder in RDX - - if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD) - { - // We are interested in just the remainder. - // RAX is used as a trashable register during computation of remainder. - info->setDstCandidates(l, RBM_RDX); - } - else - { - // We are interested in just the quotient. - // RDX gets used as trashable register during computation of quotient - info->setDstCandidates(l, RBM_RAX); - } - - bool op2CanBeRegOptional = true; -#ifdef _TARGET_X86_ - if (op1->OperGet() == GT_LONG) - { - // To avoid reg move would like to have op1's low part in RAX and high part in RDX. - GenTree* loVal = op1->gtGetOp1(); - GenTree* hiVal = op1->gtGetOp2(); - - // Src count is actually 3, so increment. - assert(op2->IsCnsIntOrI()); - assert(tree->OperGet() == GT_UMOD); - info->srcCount++; - op2CanBeRegOptional = false; - - // This situation also requires an internal register. - info->internalIntCount = 1; - info->setInternalCandidates(l, l->allRegs(TYP_INT)); - - loVal->gtLsraInfo.setSrcCandidates(l, RBM_EAX); - hiVal->gtLsraInfo.setSrcCandidates(l, RBM_EDX); - } - else -#endif - { - // If possible would like to have op1 in RAX to avoid a register move - op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX); - } - - // divisor can be an r/m, but the memory indirection must be of the same size as the divide - if (op2->isMemoryOp() && (op2->TypeGet() == tree->TypeGet())) - { - MakeSrcContained(tree, op2); - } - else if (op2CanBeRegOptional) - { - op2->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX)); - - // If there are no containable operands, we can make an operand reg optional. - // Div instruction allows only op2 to be a memory op. - SetRegOptional(op2); - } -} - -//------------------------------------------------------------------------ -// TreeNodeInfoInitIntrinsic: Set the NodeInfo for a GT_INTRINSIC. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// None. -// -void Lowering::TreeNodeInfoInitIntrinsic(GenTree* tree) -{ - TreeNodeInfo* info = &(tree->gtLsraInfo); - LinearScan* l = m_lsra; - - // Both operand and its result must be of floating point type. - GenTree* op1 = tree->gtGetOp1(); - assert(varTypeIsFloating(op1)); - assert(op1->TypeGet() == tree->TypeGet()); - - info->srcCount = 1; - info->dstCount = 1; - - switch (tree->gtIntrinsic.gtIntrinsicId) - { - case CORINFO_INTRINSIC_Sqrt: - if (op1->isMemoryOp() || op1->IsCnsNonZeroFltOrDbl()) - { - MakeSrcContained(tree, op1); - } - else - { - // Mark the operand as reg optional since codegen can still - // generate code if op1 is on stack. - SetRegOptional(op1); - } - break; - - case CORINFO_INTRINSIC_Abs: - // Abs(float x) = x & 0x7fffffff - // Abs(double x) = x & 0x7ffffff ffffffff - - // In case of Abs we need an internal register to hold mask. - - // TODO-XArch-CQ: avoid using an internal register for the mask. - // Andps or andpd both will operate on 128-bit operands. - // The data section constant to hold the mask is a 64-bit size. - // Therefore, we need both the operand and mask to be in - // xmm register. When we add support in emitter to emit 128-bit - // data constants and instructions that operate on 128-bit - // memory operands we can avoid the need for an internal register. - if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs) - { - info->internalFloatCount = 1; - info->setInternalCandidates(l, l->internalFloatRegCandidates()); - } - break; - -#ifdef _TARGET_X86_ - case CORINFO_INTRINSIC_Cos: - case CORINFO_INTRINSIC_Sin: - case CORINFO_INTRINSIC_Round: - NYI_X86("Math intrinsics Cos, Sin and Round"); - break; -#endif // _TARGET_X86_ - - default: - // Right now only Sqrt/Abs are treated as math intrinsics - noway_assert(!"Unsupported math intrinsic"); - unreached(); - break; - } -} - -#ifdef FEATURE_SIMD -//------------------------------------------------------------------------ -// TreeNodeInfoInitSIMD: Set the NodeInfo for a GT_SIMD tree. -// -// Arguments: -// tree - The GT_SIMD node of interest -// -// Return Value: -// None. - -void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) -{ - GenTreeSIMD* simdTree = tree->AsSIMD(); - TreeNodeInfo* info = &(tree->gtLsraInfo); - LinearScan* lsra = m_lsra; - info->dstCount = 1; - switch (simdTree->gtSIMDIntrinsicID) - { - GenTree* op1; - GenTree* op2; - - case SIMDIntrinsicInit: - { - info->srcCount = 1; - op1 = tree->gtOp.gtOp1; - - // This sets all fields of a SIMD struct to the given value. - // Mark op1 as contained if it is either zero or int constant of all 1's, - // or a float constant with 16 or 32 byte simdType (AVX case) - // - // Should never see small int base type vectors except for zero initialization. - assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0)); - - if (op1->IsFPZero() || op1->IsIntegralConst(0) || - (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1))) - { - MakeSrcContained(tree, tree->gtOp.gtOp1); - info->srcCount = 0; - } - else if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) && - ((simdTree->gtSIMDSize == 16) || (simdTree->gtSIMDSize == 32))) - { - // Either op1 is a float or dbl constant or an addr - if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr()) - { - MakeSrcContained(tree, tree->gtOp.gtOp1); - info->srcCount = 0; - } - } - } - break; - - case SIMDIntrinsicInitN: - { - info->srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(simdTree->gtSIMDBaseType)); - - // Need an internal register to stitch together all the values into a single vector in a SIMD reg. - info->internalFloatCount = 1; - info->setInternalCandidates(lsra, lsra->allSIMDRegs()); - } - break; - - case SIMDIntrinsicInitArray: - // We have an array and an index, which may be contained. - info->srcCount = 2; - CheckImmedAndMakeContained(tree, tree->gtGetOp2()); - break; - - case SIMDIntrinsicDiv: - // SSE2 has no instruction support for division on integer vectors - noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType)); - info->srcCount = 2; - break; - - case SIMDIntrinsicAbs: - // This gets implemented as bitwise-And operation with a mask - // and hence should never see it here. - unreached(); - break; - - case SIMDIntrinsicSqrt: - // SSE2 has no instruction support for sqrt on integer vectors. - noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType)); - info->srcCount = 1; - break; - - case SIMDIntrinsicAdd: - case SIMDIntrinsicSub: - case SIMDIntrinsicMul: - case SIMDIntrinsicBitwiseAnd: - case SIMDIntrinsicBitwiseAndNot: - case SIMDIntrinsicBitwiseOr: - case SIMDIntrinsicBitwiseXor: - case SIMDIntrinsicMin: - case SIMDIntrinsicMax: - info->srcCount = 2; - - // SSE2 32-bit integer multiplication requires two temp regs - if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT && - comp->getSIMDInstructionSet() == InstructionSet_SSE2) - { - info->internalFloatCount = 2; - info->setInternalCandidates(lsra, lsra->allSIMDRegs()); - } - break; - - case SIMDIntrinsicEqual: - info->srcCount = 2; - break; - - // SSE2 doesn't support < and <= directly on int vectors. - // Instead we need to use > and >= with swapped operands. - case SIMDIntrinsicLessThan: - case SIMDIntrinsicLessThanOrEqual: - info->srcCount = 2; - noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType)); - break; - - // SIMDIntrinsicEqual is supported only on non-floating point base type vectors. - // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors. - // Instead we need to use < and <= with swapped operands. - case SIMDIntrinsicGreaterThan: - noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType)); - info->srcCount = 2; - break; - - case SIMDIntrinsicOpEquality: - case SIMDIntrinsicOpInEquality: - info->srcCount = 2; - - // On SSE4/AVX, we can generate optimal code for (in)equality - // against zero using ptest. We can safely do the this optimization - // for integral vectors but not for floating-point for the reason - // that we have +0.0 and -0.0 and +0.0 == -0.0 - op2 = tree->gtGetOp2(); - if ((comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0)) - { - MakeSrcContained(tree, op2); - } - else - { - - // Need one SIMD register as scratch. - // See genSIMDIntrinsicRelOp() for details on code sequence generated and - // the need for one scratch register. - // - // Note these intrinsics produce a BOOL result, hence internal float - // registers reserved are guaranteed to be different from target - // integer register without explicitly specifying. - info->internalFloatCount = 1; - info->setInternalCandidates(lsra, lsra->allSIMDRegs()); - } - break; - - case SIMDIntrinsicDotProduct: - // Float/Double vectors: - // For SSE, or AVX with 32-byte vectors, we also need an internal register - // as scratch. Further we need the targetReg and internal reg to be distinct - // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we - // don't need a tmpReg. - // - // 32-byte integer vector on SSE4/AVX: - // will take advantage of phaddd, which operates only on 128-bit xmm reg. - // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal - // registers since targetReg is an int type register. - // - // See genSIMDIntrinsicDotProduct() for details on code sequence generated - // and the need for scratch registers. - if (varTypeIsFloating(simdTree->gtSIMDBaseType)) - { - if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) || - (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32)) - { - info->internalFloatCount = 1; - info->isInternalRegDelayFree = true; - info->setInternalCandidates(lsra, lsra->allSIMDRegs()); - } - // else don't need scratch reg(s). - } - else - { - assert(simdTree->gtSIMDBaseType == TYP_INT && comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4); - - // No need to set isInternalRegDelayFree since targetReg is a - // an int type reg and guaranteed to be different from xmm/ymm - // regs. - info->internalFloatCount = comp->canUseAVX() ? 2 : 1; - info->setInternalCandidates(lsra, lsra->allSIMDRegs()); - } - info->srcCount = 2; - break; - - case SIMDIntrinsicGetItem: - { - // This implements get_Item method. The sources are: - // - the source SIMD struct - // - index (which element to get) - // The result is baseType of SIMD struct. - info->srcCount = 2; - op1 = tree->gtOp.gtOp1; - op2 = tree->gtOp.gtOp2; - - // If the index is a constant, mark it as contained. - if (CheckImmedAndMakeContained(tree, op2)) - { - info->srcCount = 1; - } - - if (op1->isMemoryOp()) - { - MakeSrcContained(tree, op1); - - // Although GT_IND of TYP_SIMD12 reserves an internal float - // register for reading 4 and 8 bytes from memory and - // assembling them into target XMM reg, it is not required - // in this case. - op1->gtLsraInfo.internalIntCount = 0; - op1->gtLsraInfo.internalFloatCount = 0; - } - else - { - // If the index is not a constant, we will use the SIMD temp location to store the vector. - // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we - // can use that in the process of extracting the element. - // - // If the index is a constant and base type is a small int we can use pextrw, but on AVX - // we will need a temp if are indexing into the upper half of the AVX register. - // In all other cases with constant index, we need a temp xmm register to extract the - // element if index is other than zero. - - if (!op2->IsCnsIntOrI()) - { - (void)comp->getSIMDInitTempVarNum(); - } - else if (!varTypeIsFloating(simdTree->gtSIMDBaseType)) - { - bool needFloatTemp; - if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) && - (comp->getSIMDInstructionSet() == InstructionSet_AVX)) - { - int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType); - needFloatTemp = (byteShiftCnt >= 16); - } - else - { - needFloatTemp = !op2->IsIntegralConst(0); - } - - if (needFloatTemp) - { - info->internalFloatCount = 1; - info->setInternalCandidates(lsra, lsra->allSIMDRegs()); - } - } - } - } - break; - - case SIMDIntrinsicSetX: - case SIMDIntrinsicSetY: - case SIMDIntrinsicSetZ: - case SIMDIntrinsicSetW: - info->srcCount = 2; - - // We need an internal integer register for SSE2 codegen - if (comp->getSIMDInstructionSet() == InstructionSet_SSE2) - { - info->internalIntCount = 1; - info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT)); - } - - break; - - case SIMDIntrinsicCast: - info->srcCount = 1; - break; - - case SIMDIntrinsicShuffleSSE2: - info->srcCount = 2; - // Second operand is an integer constant and marked as contained. - op2 = tree->gtOp.gtOp2; - noway_assert(op2->IsCnsIntOrI()); - MakeSrcContained(tree, op2); - break; - - case SIMDIntrinsicGetX: - case SIMDIntrinsicGetY: - case SIMDIntrinsicGetZ: - case SIMDIntrinsicGetW: - case SIMDIntrinsicGetOne: - case SIMDIntrinsicGetZero: - case SIMDIntrinsicGetCount: - case SIMDIntrinsicGetAllOnes: - assert(!"Get intrinsics should not be seen during Lowering."); - unreached(); - - default: - noway_assert(!"Unimplemented SIMD node type."); - unreached(); - } -} -#endif // FEATURE_SIMD - -//------------------------------------------------------------------------ -// TreeNodeInfoInitCast: Set the NodeInfo for a GT_CAST. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// None. -// -void Lowering::TreeNodeInfoInitCast(GenTree* tree) -{ - TreeNodeInfo* info = &(tree->gtLsraInfo); - - // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register. - // see CodeGen::genIntToIntCast() - - info->srcCount = 1; - info->dstCount = 1; - - // Non-overflow casts to/from float/double are done using SSE2 instructions - // and that allow the source operand to be either a reg or memop. Given the - // fact that casts from small int to float/double are done as two-level casts, - // the source operand is always guaranteed to be of size 4 or 8 bytes. - var_types castToType = tree->CastToType(); - GenTreePtr castOp = tree->gtCast.CastOp(); - var_types castOpType = castOp->TypeGet(); - if (tree->gtFlags & GTF_UNSIGNED) - { - castOpType = genUnsignedType(castOpType); - } - - if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType))) - { -#ifdef DEBUG - // If converting to float/double, the operand must be 4 or 8 byte in size. - if (varTypeIsFloating(castToType)) - { - unsigned opSize = genTypeSize(castOpType); - assert(opSize == 4 || opSize == 8); - } -#endif // DEBUG - - // U8 -> R8 conversion requires that the operand be in a register. - if (castOpType != TYP_ULONG) - { - if (castOp->isMemoryOp() || castOp->IsCnsNonZeroFltOrDbl()) - { - MakeSrcContained(tree, castOp); - } - else - { - // Mark castOp as reg optional to indicate codegen - // can still generate code if it is on stack. - SetRegOptional(castOp); - } - } - } - -#if !defined(_TARGET_64BIT_) - if (varTypeIsLong(castOpType)) - { - noway_assert(castOp->OperGet() == GT_LONG); - info->srcCount = 2; - } -#endif // !defined(_TARGET_64BIT_) - - // some overflow checks need a temp reg: - // - GT_CAST from INT64/UINT64 to UINT32 - if (tree->gtOverflow() && (castToType == TYP_UINT)) - { - if (genTypeSize(castOpType) == 8) - { - // Here we don't need internal register to be different from targetReg, - // rather require it to be different from operand's reg. - info->internalIntCount = 1; - } - } -} - -void Lowering::LowerGCWriteBarrier(GenTree* tree) -{ - assert(tree->OperGet() == GT_STOREIND); - - GenTreeStoreInd* dst = tree->AsStoreInd(); - GenTreePtr addr = dst->Addr(); - GenTreePtr src = dst->Data(); - - if (addr->OperGet() == GT_LEA) - { - // In the case where we are doing a helper assignment, if the dst - // is an indir through an lea, we need to actually instantiate the - // lea in a register - GenTreeAddrMode* lea = addr->AsAddrMode(); - - int leaSrcCount = 0; - if (lea->HasBase()) - { - leaSrcCount++; - } - if (lea->HasIndex()) - { - leaSrcCount++; - } - lea->gtLsraInfo.srcCount = leaSrcCount; - lea->gtLsraInfo.dstCount = 1; - } - - bool useOptimizedWriteBarrierHelper = false; // By default, assume no optimized write barriers. - -#if NOGC_WRITE_BARRIERS - -#if defined(_TARGET_X86_) - - useOptimizedWriteBarrierHelper = true; // On x86, use the optimized write barriers by default. -#ifdef DEBUG - GCInfo::WriteBarrierForm wbf = comp->codeGen->gcInfo.gcIsWriteBarrierCandidate(tree, src); - if (wbf == GCInfo::WBF_NoBarrier_CheckNotHeapInDebug) // This one is always a call to a C++ method. - { - useOptimizedWriteBarrierHelper = false; - } -#endif - - if (useOptimizedWriteBarrierHelper) - { - // Special write barrier: - // op1 (addr) goes into REG_WRITE_BARRIER (rdx) and - // op2 (src) goes into any int register. - addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER); - src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER_SRC); - } - -#else // !defined(_TARGET_X86_) -#error "NOGC_WRITE_BARRIERS is not supported" -#endif // !defined(_TARGET_X86_) - -#endif // NOGC_WRITE_BARRIERS - - if (!useOptimizedWriteBarrierHelper) - { - // For the standard JIT Helper calls: - // op1 (addr) goes into REG_ARG_0 and - // op2 (src) goes into REG_ARG_1 - addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0); - src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1); - } - - // Both src and dst must reside in a register, which they should since we haven't set - // either of them as contained. - assert(addr->gtLsraInfo.dstCount == 1); - assert(src->gtLsraInfo.dstCount == 1); -} - -//----------------------------------------------------------------------------------------- -// Specify register requirements for address expression of an indirection operation. -// -// Arguments: -// indirTree - GT_IND or GT_STOREIND gentree node -// -void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree) -{ - assert(indirTree->isIndir()); - // If this is the rhs of a block copy (i.e. non-enregisterable struct), - // it has no register requirements. - if (indirTree->TypeGet() == TYP_STRUCT) - { - return; - } - - GenTreePtr addr = indirTree->gtGetOp1(); - TreeNodeInfo* info = &(indirTree->gtLsraInfo); - - GenTreePtr base = nullptr; - GenTreePtr index = nullptr; - unsigned mul, cns; - bool rev; - -#ifdef FEATURE_SIMD - // If indirTree is of TYP_SIMD12, don't mark addr as contained - // so that it always get computed to a register. This would - // mean codegen side logic doesn't need to handle all possible - // addr expressions that could be contained. - // - // TODO-XArch-CQ: handle other addr mode expressions that could be marked - // as contained. - if (indirTree->TypeGet() == TYP_SIMD12) - { - // Vector3 is read/written as two reads/writes: 8 byte and 4 byte. - // To assemble the vector properly we would need an additional - // XMM register. - info->internalFloatCount = 1; - - // In case of GT_IND we need an internal register different from targetReg and - // both of the registers are used at the same time. - if (indirTree->OperGet() == GT_IND) - { - info->isInternalRegDelayFree = true; - } - - info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs()); - - return; - } -#endif // FEATURE_SIMD - - if ((indirTree->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0) - { - // The address of an indirection that requires its address in a reg. - // Skip any further processing that might otherwise make it contained. - } - else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR)) - { - // These nodes go into an addr mode: - // - GT_CLS_VAR_ADDR turns into a constant. - // - GT_LCL_VAR_ADDR is a stack addr mode. - - // make this contained, it turns into a constant that goes into an addr mode - MakeSrcContained(indirTree, addr); - } - else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp)) - { - // Amd64: - // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address. - // (i.e. those VSD calls for which stub addr is known during JIT compilation time). In this case, - // VM requires us to pass stub addr in REG_VIRTUAL_STUB_PARAM - see LowerVirtualStubCall(). For - // that reason we cannot mark such an addr as contained. Note that this is not an issue for - // indirect VSD calls since morphArgs() is explicitly materializing hidden param as a non-standard - // argument. - // - // Workaround: - // Note that LowerVirtualStubCall() sets addr->gtRegNum to REG_VIRTUAL_STUB_PARAM and Lowering::doPhase() - // sets destination candidates on such nodes and resets addr->gtRegNum to REG_NA before calling - // TreeNodeInfoInit(). Ideally we should set a flag on addr nodes that shouldn't be marked as contained - // (in LowerVirtualStubCall()), but we don't have any GTF_* flags left for that purpose. As a workaround - // an explicit check is made here. - // - // On x86, direct VSD is done via a relative branch, and in fact it MUST be contained. - MakeSrcContained(indirTree, addr); - } - else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr)) - { - MakeSrcContained(indirTree, addr); - } - else if (comp->codeGen->genCreateAddrMode(addr, -1, true, 0, &rev, &base, &index, &mul, &cns, true /*nogen*/) && - !AreSourcesPossiblyModifiedLocals(indirTree, base, index)) - { - // An addressing mode will be constructed that may cause some - // nodes to not need a register, and cause others' lifetimes to be extended - // to the GT_IND or even its parent if it's an assignment - - assert(base != addr); - m_lsra->clearOperandCounts(addr); - - const bool hasBase = base != nullptr; - const bool hasIndex = index != nullptr; - assert(hasBase || hasIndex); // At least one of a base or an index must be present. - - // If the addressing mode has both a base and an index, bump its source count by one. If it only has one or the - // other, its source count is already correct (due to the source for the address itself). - if (hasBase && hasIndex) - { - info->srcCount++; - } - - // Traverse the computation below GT_IND to find the operands - // for the addressing mode, marking the various constants and - // intermediate results as not consuming/producing. - // If the traversal were more complex, we might consider using - // a traversal function, but the addressing mode is only made - // up of simple arithmetic operators, and the code generator - // only traverses one leg of each node. - - bool foundBase = !hasBase; - bool foundIndex = !hasIndex; - for (GenTree *child = addr, *nextChild = nullptr; child != nullptr && !child->OperIsLeaf(); child = nextChild) - { - nextChild = nullptr; - GenTree* op1 = child->gtOp.gtOp1; - GenTree* op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr; - - if (op1 == base) - { - foundBase = true; - } - else if (op1 == index) - { - foundIndex = true; - } - else - { - m_lsra->clearOperandCounts(op1); - if (!op1->OperIsLeaf()) - { - nextChild = op1; - } - } - - if (op2 != nullptr) - { - if (op2 == base) - { - foundBase = true; - } - else if (op2 == index) - { - foundIndex = true; - } - else - { - m_lsra->clearOperandCounts(op2); - if (!op2->OperIsLeaf()) - { - assert(nextChild == nullptr); - nextChild = op2; - } - } - } - } - assert(foundBase && foundIndex); - } - else if (addr->gtOper == GT_ARR_ELEM) - { - // The GT_ARR_ELEM consumes all the indices and produces the offset. - // The array object lives until the mem access. - // We also consume the target register to which the address is - // computed - - info->srcCount++; - assert(addr->gtLsraInfo.srcCount >= 2); - addr->gtLsraInfo.srcCount -= 1; - } -} - -void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree) -{ - assert(tree->OperIsCompare()); - - TreeNodeInfo* info = &(tree->gtLsraInfo); - - info->srcCount = 2; - info->dstCount = 1; - -#ifdef _TARGET_X86_ - // If the compare is used by a jump, we just need to set the condition codes. If not, then we need - // to store the result into the low byte of a register, which requires the dst be a byteable register. - // We always set the dst candidates, though, because if this is compare is consumed by a jump, they - // won't be used. We might be able to use GTF_RELOP_JMP_USED to determine this case, but it's not clear - // that flag is maintained until this location (especially for decomposed long compares). - info->setDstCandidates(m_lsra, RBM_BYTE_REGS); -#endif // _TARGET_X86_ - - GenTreePtr op1 = tree->gtOp.gtOp1; - GenTreePtr op2 = tree->gtOp.gtOp2; - var_types op1Type = op1->TypeGet(); - var_types op2Type = op2->TypeGet(); - -#if !defined(_TARGET_64BIT_) - // Long compares will consume GT_LONG nodes, each of which produces two results. - // Thus for each long operand there will be an additional source. - // TODO-X86-CQ: Mark hiOp2 and loOp2 as contained if it is a constant or a memory op. - if (varTypeIsLong(op1Type)) - { - info->srcCount++; - } - if (varTypeIsLong(op2Type)) - { - info->srcCount++; - } -#endif // !defined(_TARGET_64BIT_) - - // If either of op1 or op2 is floating point values, then we need to use - // ucomiss or ucomisd to compare, both of which support the following form: - // ucomis[s|d] xmm, xmm/mem - // That is only the second operand can be a memory op. - // - // Second operand is a memory Op: Note that depending on comparison operator, - // the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or - // op2 can be a memory op depending on the comparison operator. - if (varTypeIsFloating(op1Type)) - { - // The type of the operands has to be the same and no implicit conversions at this stage. - assert(op1Type == op2Type); - - bool reverseOps; - if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0) - { - // Unordered comparison case - reverseOps = (tree->gtOper == GT_GT || tree->gtOper == GT_GE); - } - else - { - reverseOps = (tree->gtOper == GT_LT || tree->gtOper == GT_LE); - } - - GenTreePtr otherOp; - if (reverseOps) - { - otherOp = op1; - } - else - { - otherOp = op2; - } - - assert(otherOp != nullptr); - if (otherOp->IsCnsNonZeroFltOrDbl()) - { - MakeSrcContained(tree, otherOp); - } - else if (otherOp->isMemoryOp() && ((otherOp == op2) || IsSafeToContainMem(tree, otherOp))) - { - MakeSrcContained(tree, otherOp); - } - else - { - // SSE2 allows only otherOp to be a memory-op. Since otherOp is not - // contained, we can mark it reg-optional. - SetRegOptional(otherOp); - } - - return; - } - - // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here - // or in other backend. - - bool hasShortCast = false; - if (CheckImmedAndMakeContained(tree, op2)) - { - // If the types are the same, or if the constant is of the correct size, - // we can treat the isMemoryOp as contained. - bool op1CanBeContained = (genTypeSize(op1Type) == genTypeSize(op2Type)); - - // Do we have a short compare against a constant in op2 - // - if (varTypeIsSmall(op1Type)) - { - GenTreeIntCon* con = op2->AsIntCon(); - ssize_t ival = con->gtIconVal; - - bool isEqualityCompare = (tree->gtOper == GT_EQ || tree->gtOper == GT_NE); - bool useTest = isEqualityCompare && (ival == 0); - - if (!useTest) - { - ssize_t lo = 0; // minimum imm value allowed for cmp reg,imm - ssize_t hi = 0; // maximum imm value allowed for cmp reg,imm - bool isUnsigned = false; - - switch (op1Type) - { - case TYP_BOOL: - op1Type = TYP_UBYTE; - __fallthrough; - case TYP_UBYTE: - lo = 0; - hi = 0x7f; - isUnsigned = true; - break; - case TYP_BYTE: - lo = -0x80; - hi = 0x7f; - break; - case TYP_CHAR: - lo = 0; - hi = 0x7fff; - isUnsigned = true; - break; - case TYP_SHORT: - lo = -0x8000; - hi = 0x7fff; - break; - default: - unreached(); - } - - if ((ival >= lo) && (ival <= hi)) - { - // We can perform a small compare with the immediate 'ival' - tree->gtFlags |= GTF_RELOP_SMALL; - if (isUnsigned && !isEqualityCompare) - { - tree->gtFlags |= GTF_UNSIGNED; - } - // We can treat the isMemoryOp as "contained" - op1CanBeContained = true; - } - } - } - - if (op1CanBeContained) - { - if (op1->isMemoryOp()) - { - MakeSrcContained(tree, op1); - } - else - { - bool op1IsMadeContained = false; - - // When op1 is a GT_AND we can often generate a single "test" instruction - // instead of two instructions (an "and" instruction followed by a "cmp"/"test"). - // - // This instruction can only be used for equality or inequality comparisons. - // and we must have a compare against zero. - // - // If we have a postive test for a single bit we can reverse the condition and - // make the compare be against zero. - // - // Example: - // GT_EQ GT_NE - // / \ / \ - // GT_AND GT_CNS (0x100) ==>> GT_AND GT_CNS (0) - // / \ / \ - // andOp1 GT_CNS (0x100) andOp1 GT_CNS (0x100) - // - // We will mark the GT_AND node as contained if the tree is an equality compare with zero. - // Additionally, when we do this we also allow for a contained memory operand for "andOp1". - // - bool isEqualityCompare = (tree->gtOper == GT_EQ || tree->gtOper == GT_NE); - - if (isEqualityCompare && (op1->OperGet() == GT_AND)) - { - GenTreePtr andOp2 = op1->gtOp.gtOp2; - if (IsContainableImmed(op1, andOp2)) - { - ssize_t andOp2CnsVal = andOp2->AsIntConCommon()->IconValue(); - ssize_t relOp2CnsVal = op2->AsIntConCommon()->IconValue(); - - if ((relOp2CnsVal == andOp2CnsVal) && isPow2(andOp2CnsVal)) - { - // We have a single bit test, so now we can change the - // tree into the alternative form, - // so that we can generate a test instruction. - - // Reverse the equality comparison - tree->SetOperRaw((tree->gtOper == GT_EQ) ? GT_NE : GT_EQ); - - // Change the relOp2CnsVal to zero - relOp2CnsVal = 0; - op2->AsIntConCommon()->SetIconValue(0); - } - - // Now do we have a equality compare with zero? - // - if (relOp2CnsVal == 0) - { - // Note that child nodes must be made contained before parent nodes - - // Check for a memory operand for op1 with the test instruction - // - GenTreePtr andOp1 = op1->gtOp.gtOp1; - if (andOp1->isMemoryOp()) - { - // If the type of value memoryOp (andOp1) is not the same as the type of constant - // (andOp2) check to see whether it is safe to mark AndOp1 as contained. For e.g. in - // the following case it is not safe to mark andOp1 as contained - // AndOp1 = signed byte and andOp2 is an int constant of value 512. - // - // If it is safe, we update the type and value of andOp2 to match with andOp1. - bool containable = (andOp1->TypeGet() == op1->TypeGet()); - if (!containable) - { - ssize_t newIconVal = 0; - - switch (andOp1->TypeGet()) - { - default: - break; - case TYP_BYTE: - newIconVal = (signed char)andOp2CnsVal; - containable = FitsIn<signed char>(andOp2CnsVal); - break; - case TYP_BOOL: - case TYP_UBYTE: - newIconVal = andOp2CnsVal & 0xFF; - containable = true; - break; - case TYP_SHORT: - newIconVal = (signed short)andOp2CnsVal; - containable = FitsIn<signed short>(andOp2CnsVal); - break; - case TYP_CHAR: - newIconVal = andOp2CnsVal & 0xFFFF; - containable = true; - break; - case TYP_INT: - newIconVal = (INT32)andOp2CnsVal; - containable = FitsIn<INT32>(andOp2CnsVal); - break; - case TYP_UINT: - newIconVal = andOp2CnsVal & 0xFFFFFFFF; - containable = true; - break; - -#ifdef _TARGET_64BIT_ - case TYP_LONG: - newIconVal = (INT64)andOp2CnsVal; - containable = true; - break; - case TYP_ULONG: - newIconVal = (UINT64)andOp2CnsVal; - containable = true; - break; -#endif //_TARGET_64BIT_ - } - - if (containable) - { - andOp2->gtType = andOp1->TypeGet(); - andOp2->AsIntConCommon()->SetIconValue(newIconVal); - } - } - - // Mark the 'andOp1' memory operand as contained - // Note that for equality comparisons we don't need - // to deal with any signed or unsigned issues. - if (containable) - { - MakeSrcContained(op1, andOp1); - } - } - // Mark the 'op1' (the GT_AND) operand as contained - MakeSrcContained(tree, op1); - op1IsMadeContained = true; - - // During Codegen we will now generate "test andOp1, andOp2CnsVal" - } - } - } - else if (op1->OperGet() == GT_CAST) - { - // If the op1 is a cast operation, and cast type is one byte sized unsigned type, - // we can directly use the number in register, instead of doing an extra cast step. - var_types dstType = op1->CastToType(); - bool isUnsignedDst = varTypeIsUnsigned(dstType); - emitAttr castSize = EA_ATTR(genTypeSize(dstType)); - GenTreePtr castOp1 = op1->gtOp.gtOp1; - genTreeOps castOp1Oper = castOp1->OperGet(); - bool safeOper = false; - - // It is not always safe to change the gtType of 'castOp1' to TYP_UBYTE. - // For example when 'castOp1Oper' is a GT_RSZ or GT_RSH then we are shifting - // bits from the left into the lower bits. If we change the type to a TYP_UBYTE - // we will instead generate a byte sized shift operation: shr al, 24 - // For the following ALU operations is it safe to change the gtType to the - // smaller type: - // - if ((castOp1Oper == GT_CNS_INT) || (castOp1Oper == GT_CALL) || // the return value from a Call - (castOp1Oper == GT_LCL_VAR) || castOp1->OperIsLogical() || // GT_AND, GT_OR, GT_XOR - castOp1->isMemoryOp()) // isIndir() || isLclField(); - { - safeOper = true; - } - - if ((castSize == EA_1BYTE) && isUnsignedDst && // Unsigned cast to TYP_UBYTE - safeOper && // Must be a safe operation - !op1->gtOverflow()) // Must not be an overflow checking cast - { - // Currently all of the Oper accepted as 'safeOper' are - // non-overflow checking operations. If we were to add - // an overflow checking operation then this assert needs - // to be moved above to guard entry to this block. - // - assert(!castOp1->gtOverflowEx()); // Must not be an overflow checking operation - - // TODO-Cleanup: we're within "if (CheckImmedAndMakeContained(tree, op2))", so isn't - // the following condition always true? - if (op2->isContainedIntOrIImmed()) - { - ssize_t val = (ssize_t)op2->AsIntConCommon()->IconValue(); - if (val >= 0 && val <= 255) - { - GenTreePtr removeTreeNode = op1; - tree->gtOp.gtOp1 = castOp1; - op1 = castOp1; - castOp1->gtType = TYP_UBYTE; - - // trim down the value if castOp1 is an int constant since its type changed to UBYTE. - if (castOp1Oper == GT_CNS_INT) - { - castOp1->gtIntCon.gtIconVal = (UINT8)castOp1->gtIntCon.gtIconVal; - } - - op2->gtType = TYP_UBYTE; - tree->gtFlags |= GTF_UNSIGNED; - - // right now the op1's type is the same as op2's type. - // if op1 is MemoryOp, we should make the op1 as contained node. - if (castOp1->isMemoryOp()) - { - MakeSrcContained(tree, op1); - op1IsMadeContained = true; - } - - BlockRange().Remove(removeTreeNode); - - // We've changed the type on op1 to TYP_UBYTE, but we already processed that node. - // We need to go back and mark it byteable. - // TODO-Cleanup: it might be better to move this out of the TreeNodeInfoInit pass to - // the earlier "lower" pass, in which case the byteable check would just fall out. - // But that is quite complex! - TreeNodeInfoInitCheckByteable(op1); - -#ifdef DEBUG - if (comp->verbose) - { - printf("TreeNodeInfoInitCmp: Removing a GT_CAST to TYP_UBYTE and changing " - "castOp1->gtType to TYP_UBYTE\n"); - comp->gtDispTreeRange(BlockRange(), tree); - } -#endif - } - } - } - } - - // If not made contained, op1 can be marked as reg-optional. - if (!op1IsMadeContained) - { - SetRegOptional(op1); - - // If op1 codegen sets ZF and SF flags and ==/!= against - // zero, we don't need to generate test instruction, - // provided we don't have another GenTree node between op1 - // and tree that could potentially modify flags. - // - // TODO-CQ: right now the below peep is inexpensive and - // gets the benefit in most of cases because in majority - // of cases op1, op2 and tree would be in that order in - // execution. In general we should be able to check that all - // the nodes that come after op1 in execution order do not - // modify the flags so that it is safe to avoid generating a - // test instruction. Such a check requires that on each - // GenTree node we need to set the info whether its codegen - // will modify flags. - // - // TODO-CQ: We can optimize compare against zero in the - // following cases by generating the branch as indicated - // against each case. - // 1) unsigned compare - // < 0 - always FALSE - // <= 0 - ZF=1 and jne - // > 0 - ZF=0 and je - // >= 0 - always TRUE - // - // 2) signed compare - // < 0 - SF=1 and js - // >= 0 - SF=0 and jns - if (isEqualityCompare && op1->gtSetZSFlags() && op2->IsIntegralConst(0) && (op1->gtNext == op2) && - (op2->gtNext == tree)) - { - // Require codegen of op1 to set the flags. - assert(!op1->gtSetFlags()); - op1->gtFlags |= GTF_SET_FLAGS; - } - } - } - } - } - else if (op1Type == op2Type) - { - if (op2->isMemoryOp()) - { - MakeSrcContained(tree, op2); - } - else if (op1->isMemoryOp() && IsSafeToContainMem(tree, op1)) - { - MakeSrcContained(tree, op1); - } - else if (op1->IsCnsIntOrI()) - { - // TODO-CQ: We should be able to support swapping op1 and op2 to generate cmp reg, imm, - // but there is currently an assert in CodeGen::genCompareInt(). - // https://github.com/dotnet/coreclr/issues/7270 - SetRegOptional(op2); - } - else - { - // One of op1 or op2 could be marked as reg optional - // to indicate that codegen can still generate code - // if one of them is on stack. - SetRegOptional(PreferredRegOptionalOperand(tree)); - } - - if (varTypeIsSmall(op1Type) && varTypeIsUnsigned(op1Type)) - { - // Mark the tree as doing unsigned comparison if - // both the operands are small and unsigned types. - // Otherwise we will end up performing a signed comparison - // of two small unsigned values without zero extending them to - // TYP_INT size and which is incorrect. - tree->gtFlags |= GTF_UNSIGNED; - } - } -} - /* Lower GT_CAST(srcType, DstType) nodes. * * Casts from small int type to float/double are transformed as follows: @@ -4236,312 +950,6 @@ bool Lowering::IsRMWMemOpRootedAtStoreInd(GenTreePtr tree, GenTreePtr* outIndirC return true; } -//-------------------------------------------------------------------------------------------- -// SetStoreIndOpCountsIfRMWMemOp checks to see if there is a RMW memory operation rooted at -// GT_STOREIND node and if so will mark register requirements for nodes under storeInd so -// that CodeGen will generate a single instruction of the form: -// -// binOp [addressing mode], reg -// -// Parameters -// storeInd - GT_STOREIND node -// -// Return value -// True, if RMW memory op tree pattern is recognized and op counts are set. -// False otherwise. -// -bool Lowering::SetStoreIndOpCountsIfRMWMemOp(GenTreePtr storeInd) -{ - assert(storeInd->OperGet() == GT_STOREIND); - - // SSE2 doesn't support RMW on float values - assert(!varTypeIsFloating(storeInd)); - - // Terminology: - // indirDst = memory write of an addr mode (i.e. storeind destination) - // indirSrc = value being written to memory (i.e. storeind source which could a binary/unary op) - // indirCandidate = memory read i.e. a gtInd of an addr mode - // indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node) - - GenTreePtr indirCandidate = nullptr; - GenTreePtr indirOpSource = nullptr; - - if (!IsRMWMemOpRootedAtStoreInd(storeInd, &indirCandidate, &indirOpSource)) - { - JITDUMP("Lower of StoreInd didn't mark the node as self contained for reason: %d\n", - storeInd->AsStoreInd()->GetRMWStatus()); - DISPTREERANGE(BlockRange(), storeInd); - return false; - } - - GenTreePtr indirDst = storeInd->gtGetOp1(); - GenTreePtr indirSrc = storeInd->gtGetOp2(); - genTreeOps oper = indirSrc->OperGet(); - - // At this point we have successfully detected a RMW memory op of one of the following forms - // storeInd(indirDst, indirSrc(indirCandidate, indirOpSource)) OR - // storeInd(indirDst, indirSrc(indirOpSource, indirCandidate) in case of commutative operations OR - // storeInd(indirDst, indirSrc(indirCandidate) in case of unary operations - // - // Here indirSrc = one of the supported binary or unary operation for RMW of memory - // indirCandidate = a GT_IND node - // indirCandidateChild = operand of GT_IND indirCandidate - // - // The logic below essentially does the following - // set storeInd src count to that of the dst count of indirOpSource - // clear operand counts on indirSrc (i.e. marked as contained and storeInd will generate code for it) - // clear operand counts on indirCandidate - // clear operand counts on indirDst except when it is a GT_LCL_VAR or GT_CNS_INT that doesn't fit within addr - // base - // Increment src count of storeInd to account for the registers required to form indirDst addr mode - // clear operand counts on indirCandidateChild - - TreeNodeInfo* info = &(storeInd->gtLsraInfo); - info->dstCount = 0; - - if (GenTree::OperIsBinary(oper)) - { - // On Xarch RMW operations require that the source memory-op be in a register. - assert(!indirOpSource->isMemoryOp() || indirOpSource->gtLsraInfo.dstCount == 1); - JITDUMP("Lower succesfully detected an assignment of the form: *addrMode BinOp= source\n"); - info->srcCount = indirOpSource->gtLsraInfo.dstCount; - } - else - { - assert(GenTree::OperIsUnary(oper)); - JITDUMP("Lower succesfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n"); - info->srcCount = 0; - } - DISPTREERANGE(BlockRange(), storeInd); - - m_lsra->clearOperandCounts(indirSrc); - m_lsra->clearOperandCounts(indirCandidate); - - GenTreePtr indirCandidateChild = indirCandidate->gtGetOp1(); - if (indirCandidateChild->OperGet() == GT_LEA) - { - GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode(); - - if (addrMode->HasBase()) - { - assert(addrMode->Base()->OperIsLeaf()); - m_lsra->clearOperandCounts(addrMode->Base()); - info->srcCount++; - } - - if (addrMode->HasIndex()) - { - assert(addrMode->Index()->OperIsLeaf()); - m_lsra->clearOperandCounts(addrMode->Index()); - info->srcCount++; - } - - m_lsra->clearOperandCounts(indirDst); - } - else - { - assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || - indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR || indirCandidateChild->OperGet() == GT_CNS_INT); - - // If it is a GT_LCL_VAR, it still needs the reg to hold the address. - // We would still need a reg for GT_CNS_INT if it doesn't fit within addressing mode base. - // For GT_CLS_VAR_ADDR, we don't need a reg to hold the address, because field address value is known at jit - // time. Also, we don't need a reg for GT_CLS_VAR_ADDR. - if (indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR) - { - m_lsra->clearOperandCounts(indirDst); - } - else if (indirCandidateChild->IsCnsIntOrI() && indirCandidateChild->AsIntConCommon()->FitsInAddrBase(comp)) - { - m_lsra->clearOperandCounts(indirDst); - } - else - { - // Need a reg and hence increment src count of storeind - info->srcCount += indirCandidateChild->gtLsraInfo.dstCount; - } - } - m_lsra->clearOperandCounts(indirCandidateChild); - -#ifdef _TARGET_X86_ - if (varTypeIsByte(storeInd)) - { - // If storeInd is of TYP_BYTE, set indirOpSources to byteable registers. - bool containedNode = indirOpSource->gtLsraInfo.dstCount == 0; - if (!containedNode) - { - regMaskTP regMask = indirOpSource->gtLsraInfo.getSrcCandidates(m_lsra); - assert(regMask != RBM_NONE); - indirOpSource->gtLsraInfo.setSrcCandidates(m_lsra, regMask & ~RBM_NON_BYTE_REGS); - } - } -#endif - - return true; -} - -/** - * Takes care of annotating the src and dst register - * requirements for a GT_MUL treenode. - */ -void Lowering::SetMulOpCounts(GenTreePtr tree) -{ -#if defined(_TARGET_X86_) - assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI || tree->OperGet() == GT_MUL_LONG); -#else - assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI); -#endif - TreeNodeInfo* info = &(tree->gtLsraInfo); - - info->srcCount = 2; - info->dstCount = 1; - - GenTreePtr op1 = tree->gtOp.gtOp1; - GenTreePtr op2 = tree->gtOp.gtOp2; - - // Case of float/double mul. - if (varTypeIsFloating(tree->TypeGet())) - { - assert(tree->OperGet() == GT_MUL); - - if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl()) - { - MakeSrcContained(tree, op2); - } - else if (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))) - { - // Since GT_MUL is commutative, we will try to re-order operands if it is safe to - // generate more efficient code sequence for the case of GT_MUL(op1=memOp, op2=non-memOp) - MakeSrcContained(tree, op1); - } - else - { - // If there are no containable operands, we can make an operand reg optional. - SetRegOptionalForBinOp(tree); - } - return; - } - - bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0); - bool requiresOverflowCheck = tree->gtOverflowEx(); - bool useLeaEncoding = false; - GenTreePtr memOp = nullptr; - - bool hasImpliedFirstOperand = false; - GenTreeIntConCommon* imm = nullptr; - GenTreePtr other = nullptr; - -// There are three forms of x86 multiply: -// one-op form: RDX:RAX = RAX * r/m -// two-op form: reg *= r/m -// three-op form: reg = r/m * imm - -// This special widening 32x32->64 MUL is not used on x64 -#if defined(_TARGET_X86_) - if (tree->OperGet() != GT_MUL_LONG) -#endif - { - assert((tree->gtFlags & GTF_MUL_64RSLT) == 0); - } - - // Multiply should never be using small types - assert(!varTypeIsSmall(tree->TypeGet())); - - // We do use the widening multiply to implement - // the overflow checking for unsigned multiply - // - if (isUnsignedMultiply && requiresOverflowCheck) - { - // The only encoding provided is RDX:RAX = RAX * rm - // - // Here we set RAX as the only destination candidate - // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX - // - info->setDstCandidates(m_lsra, RBM_RAX); - hasImpliedFirstOperand = true; - } - else if (tree->OperGet() == GT_MULHI) - { - // Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the - // upper 32 bits of the result set the destination candidate to REG_RDX. - info->setDstCandidates(m_lsra, RBM_RDX); - hasImpliedFirstOperand = true; - } -#if defined(_TARGET_X86_) - else if (tree->OperGet() == GT_MUL_LONG) - { - // have to use the encoding:RDX:RAX = RAX * rm - info->setDstCandidates(m_lsra, RBM_RAX); - hasImpliedFirstOperand = true; - } -#endif - else if (IsContainableImmed(tree, op2) || IsContainableImmed(tree, op1)) - { - if (IsContainableImmed(tree, op2)) - { - imm = op2->AsIntConCommon(); - other = op1; - } - else - { - imm = op1->AsIntConCommon(); - other = op2; - } - - // CQ: We want to rewrite this into a LEA - ssize_t immVal = imm->AsIntConCommon()->IconValue(); - if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9)) - { - useLeaEncoding = true; - } - - MakeSrcContained(tree, imm); // The imm is always contained - if (other->isMemoryOp()) - { - memOp = other; // memOp may be contained below - } - } - - // We allow one operand to be a contained memory operand. - // The memory op type must match with the 'tree' type. - // This is because during codegen we use 'tree' type to derive EmitTypeSize. - // E.g op1 type = byte, op2 type = byte but GT_MUL tree type is int. - // - if (memOp == nullptr && op2->isMemoryOp()) - { - memOp = op2; - } - - // To generate an LEA we need to force memOp into a register - // so don't allow memOp to be 'contained' - // - if (!useLeaEncoding) - { - if ((memOp != nullptr) && (memOp->TypeGet() == tree->TypeGet()) && IsSafeToContainMem(tree, memOp)) - { - MakeSrcContained(tree, memOp); - } - else if (imm != nullptr) - { - // Has a contained immediate operand. - // Only 'other' operand can be marked as reg optional. - assert(other != nullptr); - SetRegOptional(other); - } - else if (hasImpliedFirstOperand) - { - // Only op2 can be marke as reg optional. - SetRegOptional(op2); - } - else - { - // If there are no containable operands, we can make either of op1 or op2 - // as reg optional. - SetRegOptionalForBinOp(tree); - } - } -} - //------------------------------------------------------------------------------ // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format // @@ -4732,71 +1140,6 @@ GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree) return preferredOp; } -#ifdef _TARGET_X86_ -//------------------------------------------------------------------------ -// ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for -// various reasons -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// If we need to exclude non-byteable registers -// -bool Lowering::ExcludeNonByteableRegisters(GenTree* tree) -{ - // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr' - // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT - // value. In this case we need to exclude esi/edi from the src candidates of op2. - if (varTypeIsByte(tree)) - { - return true; - } - // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool. - else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType())) - { - return true; - } - else if (tree->OperIsCompare()) - { - GenTree* op1 = tree->gtGetOp1(); - GenTree* op2 = tree->gtGetOp2(); - - // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses - // ubyte as the result of comparison and if the result needs to be materialized into a reg - // simply zero extend it to TYP_INT size. Here is an example of generated code: - // cmp dl, byte ptr[addr mode] - // movzx edx, dl - if (varTypeIsByte(op1) && varTypeIsByte(op2)) - { - return true; - } - // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses - // ubyte as the result of the comparison and if the result needs to be materialized into a reg - // simply zero extend it to TYP_INT size. - else if (varTypeIsByte(op1) && op2->IsCnsIntOrI()) - { - return true; - } - // Example4: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses - // ubyte as the result of the comparison and if the result needs to be materialized into a reg - // simply zero extend it to TYP_INT size. - else if (op1->IsCnsIntOrI() && varTypeIsByte(op2)) - { - return true; - } - else - { - return false; - } - } - else - { - return false; - } -} -#endif // _TARGET_X86_ - #endif // _TARGET_XARCH_ #endif // !LEGACY_BACKEND |