Diffstat (limited to 'src/jit/lsraxarch.cpp')
-rw-r--r-- | src/jit/lsraxarch.cpp | 3684 |
1 file changed, 3684 insertions, 0 deletions
diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp new file mode 100644 index 0000000000..a4da2b7ce6 --- /dev/null +++ b/src/jit/lsraxarch.cpp @@ -0,0 +1,3684 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XX XX +XX Register Requirements for AMD64 XX +XX XX +XX This encapsulates all the logic for setting register requirements for XX +XX the AMD64 architecture. XX +XX XX +XX XX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +*/ + +#include "jitpch.h" +#ifdef _MSC_VER +#pragma hdrstop +#endif + +#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator + +#ifdef _TARGET_XARCH_ + +#include "jit.h" +#include "sideeffects.h" +#include "lower.h" + +//------------------------------------------------------------------------ +// TreeNodeInfoInitStoreLoc: Set register requirements for a store of a lclVar +// +// Arguments: +// storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR) +// +// Notes: +// This involves: +// - Setting the appropriate candidates for a store of a multi-reg call return value. +// - Requesting an internal register for SIMD12 stores. +// - Handling of contained immediates. +// - Widening operations of unsigneds. (TODO: Move to 1st phase of Lowering) + +void Lowering::TreeNodeInfoInitStoreLoc(GenTreeLclVarCommon* storeLoc) +{ + TreeNodeInfo* info = &(storeLoc->gtLsraInfo); + + // Is this the case of var = call where call is returning + // a value in multiple return registers? + GenTree* op1 = storeLoc->gtGetOp1(); + if (op1->IsMultiRegCall()) + { + // backend expects to see this case only for store lclvar. + assert(storeLoc->OperGet() == GT_STORE_LCL_VAR); + + // srcCount = number of registers in which the value is returned by call + GenTreeCall* call = op1->AsCall(); + ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); + info->srcCount = retTypeDesc->GetReturnRegCount(); + + // Call node srcCandidates = Bitwise-OR(allregs(GetReturnRegType(i))) for all i=0..RetRegCount-1 + regMaskTP srcCandidates = m_lsra->allMultiRegCallNodeRegs(call); + op1->gtLsraInfo.setSrcCandidates(m_lsra, srcCandidates); + return; + } + +#ifdef FEATURE_SIMD + if (varTypeIsSIMD(storeLoc)) + { + if (op1->IsCnsIntOrI()) + { + // InitBlk + MakeSrcContained(storeLoc, op1); + } + else if (storeLoc->TypeGet() == TYP_SIMD12) + { + // Need an additional register to extract upper 4 bytes of Vector3. + info->internalFloatCount = 1; + info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs()); + + // In this case don't mark the operand as contained as we want it to + // be evaluated into an xmm register + } + return; + } +#endif // FEATURE_SIMD + + // If the source is a containable immediate, make it contained, unless it is + // an int-size or larger store of zero to memory, because we can generate smaller code + // by zeroing a register and then storing it. 
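+    // Illustrative example (registers and offsets arbitrary): for an int-size zero store,
+    //     xor  eax, eax
+    //     mov  dword ptr [ebp-8], eax
+    // is smaller than the 4-byte-immediate form
+    //     mov  dword ptr [ebp-8], 0
+    // which is why an int-size (or larger) zero is deliberately left un-contained here.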
+ if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(storeLoc))) + { + MakeSrcContained(storeLoc, op1); + } + + // TODO: This should be moved to Lowering, but it widens the types, which changes the behavior + // of the above condition. + LowerStoreLoc(storeLoc); +} + +//------------------------------------------------------------------------ +// TreeNodeInfoInit: Set register requirements for a node +// +// Arguments: +// treeNode - the node of interest +// +// Notes: +// Preconditions: +// LSRA Has been initialized and there is a TreeNodeInfo node +// already allocated and initialized for every tree in the IR. +// Postconditions: +// Every TreeNodeInfo instance has the right annotations on register +// requirements needed by LSRA to build the Interval Table (source, +// destination and internal [temp] register counts). +// +void Lowering::TreeNodeInfoInit(GenTree* tree) +{ + LinearScan* l = m_lsra; + Compiler* compiler = comp; + + TreeNodeInfo* info = &(tree->gtLsraInfo); +#ifdef DEBUG + if (comp->verbose) + { + printf("TreeNodeInfoInit:\n"); + comp->gtDispTreeRange(BlockRange(), tree); + } +#endif + // floating type generates AVX instruction (vmovss etc.), set the flag + SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet())); + switch (tree->OperGet()) + { + GenTree* op1; + GenTree* op2; + + default: + TreeNodeInfoInitSimple(tree); + break; + + case GT_LCL_FLD: + case GT_LCL_VAR: + info->srcCount = 0; + info->dstCount = 1; + +#ifdef FEATURE_SIMD + // Need an additional register to read upper 4 bytes of Vector3. + if (tree->TypeGet() == TYP_SIMD12) + { + // We need an internal register different from targetReg in which 'tree' produces its result + // because both targetReg and internal reg will be in use at the same time. + info->internalFloatCount = 1; + info->isInternalRegDelayFree = true; + info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs()); + } +#endif + break; + + case GT_STORE_LCL_FLD: + case GT_STORE_LCL_VAR: +#ifdef _TARGET_X86_ + if (tree->gtGetOp1()->OperGet() == GT_LONG) + { + info->srcCount = 2; + } + else +#endif // _TARGET_X86_ + { + info->srcCount = 1; + } + info->dstCount = 0; + TreeNodeInfoInitStoreLoc(tree->AsLclVarCommon()); + break; + + case GT_BOX: + noway_assert(!"box should not exist here"); + // The result of 'op1' is also the final result + info->srcCount = 0; + info->dstCount = 0; + break; + + case GT_PHYSREGDST: + info->srcCount = 1; + info->dstCount = 0; + break; + + case GT_COMMA: + { + GenTreePtr firstOperand; + GenTreePtr secondOperand; + if (tree->gtFlags & GTF_REVERSE_OPS) + { + firstOperand = tree->gtOp.gtOp2; + secondOperand = tree->gtOp.gtOp1; + } + else + { + firstOperand = tree->gtOp.gtOp1; + secondOperand = tree->gtOp.gtOp2; + } + if (firstOperand->TypeGet() != TYP_VOID) + { + firstOperand->gtLsraInfo.isLocalDefUse = true; + firstOperand->gtLsraInfo.dstCount = 0; + } + if (tree->TypeGet() == TYP_VOID && secondOperand->TypeGet() != TYP_VOID) + { + secondOperand->gtLsraInfo.isLocalDefUse = true; + secondOperand->gtLsraInfo.dstCount = 0; + } + } + info->srcCount = 0; + info->dstCount = 0; + break; + + case GT_LIST: + case GT_FIELD_LIST: + case GT_ARGPLACE: + case GT_NO_OP: + case GT_START_NONGC: + case GT_PROF_HOOK: + info->srcCount = 0; + info->dstCount = 0; + break; + + case GT_CNS_DBL: + info->srcCount = 0; + info->dstCount = 1; + break; + +#if !defined(_TARGET_64BIT_) + + case GT_LONG: + if ((tree->gtLIRFlags & LIR::Flags::IsUnusedValue) != 0) + { + // An unused GT_LONG node needs to consume its sources. 
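+                // Rough sketch of the situation: the lo and hi halves were each defined into a
+                // register by the operands; counting them both as sources here records a use for
+                // each half so the register allocator does not see dangling defs.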
+ info->srcCount = 2; + } + else + { + // Passthrough + info->srcCount = 0; + } + + info->dstCount = 0; + break; + +#endif // !defined(_TARGET_64BIT_) + + case GT_QMARK: + case GT_COLON: + info->srcCount = 0; + info->dstCount = 0; + unreached(); + break; + + case GT_RETURN: + TreeNodeInfoInitReturn(tree); + break; + + case GT_RETFILT: + if (tree->TypeGet() == TYP_VOID) + { + info->srcCount = 0; + info->dstCount = 0; + } + else + { + assert(tree->TypeGet() == TYP_INT); + + info->srcCount = 1; + info->dstCount = 0; + + info->setSrcCandidates(l, RBM_INTRET); + tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET); + } + break; + + // A GT_NOP is either a passthrough (if it is void, or if it has + // a child), but must be considered to produce a dummy value if it + // has a type but no child + case GT_NOP: + info->srcCount = 0; + if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr) + { + info->dstCount = 1; + } + else + { + info->dstCount = 0; + } + break; + + case GT_JTRUE: + { + info->srcCount = 0; + info->dstCount = 0; + + GenTree* cmp = tree->gtGetOp1(); + l->clearDstCount(cmp); + +#ifdef FEATURE_SIMD + // Say we have the following IR + // simdCompareResult = GT_SIMD((In)Equality, v1, v2) + // integerCompareResult = GT_EQ/NE(simdCompareResult, true/false) + // GT_JTRUE(integerCompareResult) + // + // In this case we don't need to generate code for GT_EQ_/NE, since SIMD (In)Equality + // intrinsic will set or clear the Zero flag. + + genTreeOps cmpOper = cmp->OperGet(); + if (cmpOper == GT_EQ || cmpOper == GT_NE) + { + GenTree* cmpOp1 = cmp->gtGetOp1(); + GenTree* cmpOp2 = cmp->gtGetOp2(); + + if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1))) + { + // We always generate code for a SIMD equality comparison, but the compare + // is contained (evaluated as part of the GT_JTRUE). + // Neither the SIMD node nor the immediate need to be evaluated into a register. + l->clearOperandCounts(cmp); + l->clearDstCount(cmpOp1); + l->clearOperandCounts(cmpOp2); + + // Codegen of SIMD (in)Equality uses target integer reg only for setting flags. + // A target reg is not needed on AVX when comparing against Vector Zero. + // In all other cases we need to reserve an int type internal register, since we + // have cleared dstCount. + if (!compiler->canUseAVX() || !cmpOp1->gtGetOp2()->IsIntegralConstVector(0)) + { + ++(cmpOp1->gtLsraInfo.internalIntCount); + regMaskTP internalCandidates = cmpOp1->gtLsraInfo.getInternalCandidates(l); + internalCandidates |= l->allRegs(TYP_INT); + cmpOp1->gtLsraInfo.setInternalCandidates(l, internalCandidates); + } + + // We have to reverse compare oper in the following cases: + // 1) SIMD Equality: Sets Zero flag on equal otherwise clears it. + // Therefore, if compare oper is == or != against false(0), we will + // be checking opposite of what is required. + // + // 2) SIMD inEquality: Clears Zero flag on true otherwise sets it. + // Therefore, if compare oper is == or != against true(1), we will + // be checking opposite of what is required. 
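+                    // Concrete case (illustrative): for
+                    //     GT_JTRUE(GT_EQ(SIMD OpEquality(v1, v2), 0))
+                    // the branch must be taken when the vectors differ, but OpEquality sets ZF when
+                    // they are equal, so the relop is reversed to GT_NE below and codegen ends up
+                    // emitting jne on the flags produced by the SIMD compare.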
+ GenTreeSIMD* simdNode = cmpOp1->AsSIMD(); + if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) + { + if (cmpOp2->IsIntegralConst(0)) + { + cmp->SetOper(GenTree::ReverseRelop(cmpOper)); + } + } + else + { + assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality); + if (cmpOp2->IsIntegralConst(1)) + { + cmp->SetOper(GenTree::ReverseRelop(cmpOper)); + } + } + } + } +#endif // FEATURE_SIMD + } + break; + + case GT_JCC: + info->srcCount = 0; + info->dstCount = 0; + break; + + case GT_JMP: + info->srcCount = 0; + info->dstCount = 0; + break; + + case GT_SWITCH: + // This should never occur since switch nodes must not be visible at this + // point in the JIT. + info->srcCount = 0; + info->dstCount = 0; // To avoid getting uninit errors. + noway_assert(!"Switch must be lowered at this point"); + break; + + case GT_JMPTABLE: + info->srcCount = 0; + info->dstCount = 1; + break; + + case GT_SWITCH_TABLE: + info->srcCount = 2; + info->internalIntCount = 1; + info->dstCount = 0; + break; + + case GT_ASG: + case GT_ASG_ADD: + case GT_ASG_SUB: + noway_assert(!"We should never hit any assignment operator in lowering"); + info->srcCount = 0; + info->dstCount = 0; + break; + +#if !defined(_TARGET_64BIT_) + case GT_ADD_LO: + case GT_ADD_HI: + case GT_SUB_LO: + case GT_SUB_HI: +#endif + case GT_ADD: + case GT_SUB: + // SSE2 arithmetic instructions doesn't support the form "op mem, xmm". + // Rather they only support "op xmm, mem/xmm" form. + if (varTypeIsFloating(tree->TypeGet())) + { + // overflow operations aren't supported on float/double types. + assert(!tree->gtOverflow()); + + op1 = tree->gtGetOp1(); + op2 = tree->gtGetOp2(); + + // No implicit conversions at this stage as the expectation is that + // everything is made explicit by adding casts. + assert(op1->TypeGet() == op2->TypeGet()); + + info->srcCount = 2; + info->dstCount = 1; + + if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl()) + { + MakeSrcContained(tree, op2); + } + else if (tree->OperIsCommutative() && + (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1)))) + { + // Though we have GT_ADD(op1=memOp, op2=non-memOp, we try to reorder the operands + // as long as it is safe so that the following efficient code sequence is generated: + // addss/sd targetReg, memOp (if op1Reg == targetReg) OR + // movaps targetReg, op2Reg; addss/sd targetReg, [memOp] + // + // Instead of + // movss op1Reg, [memOp]; addss/sd targetReg, Op2Reg (if op1Reg == targetReg) OR + // movss op1Reg, [memOp]; movaps targetReg, op1Reg, addss/sd targetReg, Op2Reg + MakeSrcContained(tree, op1); + } + else + { + // If there are no containable operands, we can make an operand reg optional. 
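+                        // Rough sketch of what "reg optional" buys us: LSRA may satisfy such an
+                        // operand directly from its memory location (e.g. a spilled lclVar) rather
+                        // than reloading it, so codegen can fall back to the
+                        //     addss/sd targetReg, [mem]
+                        // form when that turns out to be cheaper.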
+ SetRegOptionalForBinOp(tree); + } + break; + } + + __fallthrough; + + case GT_AND: + case GT_OR: + case GT_XOR: + TreeNodeInfoInitLogicalOp(tree); + break; + + case GT_RETURNTRAP: + // This just turns into a compare of its child with an int + a conditional call + info->srcCount = 1; + info->dstCount = 0; + if (tree->gtOp.gtOp1->isIndir()) + { + MakeSrcContained(tree, tree->gtOp.gtOp1); + } + info->internalIntCount = 1; + info->setInternalCandidates(l, l->allRegs(TYP_INT)); + break; + + case GT_MOD: + case GT_DIV: + case GT_UMOD: + case GT_UDIV: + TreeNodeInfoInitModDiv(tree); + break; + + case GT_MUL: + case GT_MULHI: +#if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND) + case GT_MUL_LONG: +#endif + TreeNodeInfoInitMul(tree); + break; + + case GT_INTRINSIC: + TreeNodeInfoInitIntrinsic(tree); + break; + +#ifdef FEATURE_SIMD + case GT_SIMD: + TreeNodeInfoInitSIMD(tree); + break; +#endif // FEATURE_SIMD + + case GT_CAST: + TreeNodeInfoInitCast(tree); + break; + + case GT_NEG: + info->srcCount = 1; + info->dstCount = 1; + + // TODO-XArch-CQ: + // SSE instruction set doesn't have an instruction to negate a number. + // The recommended way is to xor the float/double number with a bitmask. + // The only way to xor is using xorps or xorpd both of which operate on + // 128-bit operands. To hold the bit-mask we would need another xmm + // register or a 16-byte aligned 128-bit data constant. Right now emitter + // lacks the support for emitting such constants or instruction with mem + // addressing mode referring to a 128-bit operand. For now we use an + // internal xmm register to load 32/64-bit bitmask from data section. + // Note that by trading additional data section memory (128-bit) we can + // save on the need for an internal register and also a memory-to-reg + // move. + // + // Note: another option to avoid internal register requirement is by + // lowering as GT_SUB(0, src). This will generate code different from + // Jit64 and could possibly result in compat issues (?). + if (varTypeIsFloating(tree)) + { + info->internalFloatCount = 1; + info->setInternalCandidates(l, l->internalFloatRegCandidates()); + } + else + { + // Codegen of this tree node sets ZF and SF flags. + tree->gtFlags |= GTF_ZSF_SET; + } + break; + + case GT_NOT: + info->srcCount = 1; + info->dstCount = 1; + break; + + case GT_LSH: + case GT_RSH: + case GT_RSZ: + case GT_ROL: + case GT_ROR: +#ifdef _TARGET_X86_ + case GT_LSH_HI: + case GT_RSH_LO: +#endif + TreeNodeInfoInitShiftRotate(tree); + break; + + case GT_EQ: + case GT_NE: + case GT_LT: + case GT_LE: + case GT_GE: + case GT_GT: + case GT_TEST_EQ: + case GT_TEST_NE: + TreeNodeInfoInitCmp(tree); + break; + + case GT_CKFINITE: + info->srcCount = 1; + info->dstCount = 1; + info->internalIntCount = 1; + break; + + case GT_CMPXCHG: + info->srcCount = 3; + info->dstCount = 1; + + // comparand is preferenced to RAX. + // Remaining two operands can be in any reg other than RAX. + tree->gtCmpXchg.gtOpComparand->gtLsraInfo.setSrcCandidates(l, RBM_RAX); + tree->gtCmpXchg.gtOpLocation->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX); + tree->gtCmpXchg.gtOpValue->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX); + tree->gtLsraInfo.setDstCandidates(l, RBM_RAX); + break; + + case GT_LOCKADD: + info->srcCount = 2; + info->dstCount = (tree->TypeGet() == TYP_VOID) ? 
0 : 1; + + CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2); + break; + + case GT_CALL: + TreeNodeInfoInitCall(tree->AsCall()); + break; + + case GT_ADDR: + { + // For a GT_ADDR, the child node should not be evaluated into a register + GenTreePtr child = tree->gtOp.gtOp1; + assert(!l->isCandidateLocalRef(child)); + l->clearDstCount(child); + info->srcCount = 0; + info->dstCount = 1; + } + break; + +#if !defined(FEATURE_PUT_STRUCT_ARG_STK) + case GT_OBJ: +#endif + case GT_BLK: + case GT_DYN_BLK: + // These should all be eliminated prior to Lowering. + assert(!"Non-store block node in Lowering"); + info->srcCount = 0; + info->dstCount = 0; + break; + +#ifdef FEATURE_PUT_STRUCT_ARG_STK + case GT_PUTARG_STK: + LowerPutArgStk(tree->AsPutArgStk()); + TreeNodeInfoInitPutArgStk(tree->AsPutArgStk()); + break; +#endif // FEATURE_PUT_STRUCT_ARG_STK + + case GT_STORE_BLK: + case GT_STORE_OBJ: + case GT_STORE_DYN_BLK: + LowerBlockStore(tree->AsBlk()); + TreeNodeInfoInitBlockStore(tree->AsBlk()); + break; + + case GT_INIT_VAL: + // Always a passthrough of its child's value. + info->srcCount = 0; + info->dstCount = 0; + break; + + case GT_LCLHEAP: + TreeNodeInfoInitLclHeap(tree); + break; + + case GT_ARR_BOUNDS_CHECK: +#ifdef FEATURE_SIMD + case GT_SIMD_CHK: +#endif // FEATURE_SIMD + { + GenTreeBoundsChk* node = tree->AsBoundsChk(); + // Consumes arrLen & index - has no result + info->srcCount = 2; + info->dstCount = 0; + + GenTreePtr other; + if (CheckImmedAndMakeContained(tree, node->gtIndex)) + { + other = node->gtArrLen; + } + else if (CheckImmedAndMakeContained(tree, node->gtArrLen)) + { + other = node->gtIndex; + } + else if (node->gtIndex->isMemoryOp()) + { + other = node->gtIndex; + } + else + { + other = node->gtArrLen; + } + + if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet()) + { + if (other->isMemoryOp()) + { + MakeSrcContained(tree, other); + } + else + { + // We can mark 'other' as reg optional, since it is not contained. + SetRegOptional(other); + } + } + } + break; + + case GT_ARR_ELEM: + // These must have been lowered to GT_ARR_INDEX + noway_assert(!"We should never see a GT_ARR_ELEM in lowering"); + info->srcCount = 0; + info->dstCount = 0; + break; + + case GT_ARR_INDEX: + info->srcCount = 2; + info->dstCount = 1; + // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple + // times while the result is being computed. + tree->AsArrIndex()->ArrObj()->gtLsraInfo.isDelayFree = true; + info->hasDelayFreeSrc = true; + break; + + case GT_ARR_OFFSET: + // This consumes the offset, if any, the arrObj and the effective index, + // and produces the flattened offset for this dimension. + info->srcCount = 3; + info->dstCount = 1; + + // we don't want to generate code for this + if (tree->gtArrOffs.gtOffset->IsIntegralConst(0)) + { + MakeSrcContained(tree, tree->gtArrOffs.gtOffset); + } + else + { + // Here we simply need an internal register, which must be different + // from any of the operand's registers, but may be the same as targetReg. + info->internalIntCount = 1; + } + break; + + case GT_LEA: + // The LEA usually passes its operands through to the GT_IND, in which case we'll + // clear the info->srcCount and info->dstCount later, but we may be instantiating an address, + // so we set them here. 
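+            // When the address is instantiated (illustrative operands), codegen emits something like
+            //     lea  rax, [rbx + rcx*4 + 24]
+            // consuming the base and/or index registers counted below and defining a single result.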
+ info->srcCount = 0; + if (tree->AsAddrMode()->HasBase()) + { + info->srcCount++; + } + if (tree->AsAddrMode()->HasIndex()) + { + info->srcCount++; + } + info->dstCount = 1; + break; + + case GT_STOREIND: + { + info->srcCount = 2; + info->dstCount = 0; + GenTree* src = tree->gtOp.gtOp2; + + if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree)) + { + TreeNodeInfoInitGCWriteBarrier(tree); + break; + } + + // If the source is a containable immediate, make it contained, unless it is + // an int-size or larger store of zero to memory, because we can generate smaller code + // by zeroing a register and then storing it. + if (IsContainableImmed(tree, src) && + (!src->IsIntegralConst(0) || varTypeIsSmall(tree) || tree->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR)) + { + MakeSrcContained(tree, src); + } + else if (!varTypeIsFloating(tree)) + { + // Perform recognition of trees with the following structure: + // StoreInd(addr, BinOp(expr, GT_IND(addr))) + // to be able to fold this into an instruction of the form + // BINOP [addr], register + // where register is the actual place where 'expr' is computed. + // + // SSE2 doesn't support RMW form of instructions. + if (TreeNodeInfoInitIfRMWMemOp(tree)) + { + break; + } + } + + TreeNodeInfoInitIndir(tree); + } + break; + + case GT_NULLCHECK: + info->dstCount = 0; + info->srcCount = 1; + info->isLocalDefUse = true; + break; + + case GT_IND: + info->dstCount = 1; + info->srcCount = 1; + TreeNodeInfoInitIndir(tree); + break; + + case GT_CATCH_ARG: + info->srcCount = 0; + info->dstCount = 1; + info->setDstCandidates(l, RBM_EXCEPTION_OBJECT); + break; + +#if !FEATURE_EH_FUNCLETS + case GT_END_LFIN: + info->srcCount = 0; + info->dstCount = 0; + break; +#endif + + case GT_CLS_VAR: + // These nodes are eliminated by rationalizer. + JITDUMP("Unexpected node %s in Lower.\n", GenTree::NodeName(tree->OperGet())); + unreached(); + break; + } // end switch (tree->OperGet()) + + // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1. + // Even then we would like to set isTgtPref on Op1. + if (tree->OperIsBinary() && info->srcCount >= 1) + { + if (isRMWRegOper(tree)) + { + GenTree* op1 = tree->gtOp.gtOp1; + GenTree* op2 = tree->gtOp.gtOp2; + + // Commutative opers like add/mul/and/or/xor could reverse the order of + // operands if it is safe to do so. In such a case we would like op2 to be + // target preferenced instead of op1. + if (tree->OperIsCommutative() && op1->gtLsraInfo.dstCount == 0 && op2 != nullptr) + { + op1 = op2; + op2 = tree->gtOp.gtOp1; + } + + // If we have a read-modify-write operation, we want to preference op1 to the target. + // If op1 is contained, we don't want to preference it, but it won't + // show up as a source in that case, so it will be ignored. + op1->gtLsraInfo.isTgtPref = true; + + // Is this a non-commutative operator, or is op2 a contained memory op? + // (Note that we can't call IsContained() at this point because it uses exactly the + // same information we're currently computing.) + // In either case, we need to make op2 remain live until the op is complete, by marking + // the source(s) associated with op2 as "delayFree". + // Note that if op2 of a binary RMW operator is a memory op, even if the operator + // is commutative, codegen cannot reverse them. + // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's + // more work to be done to correctly reverse the operands if they involve memory + // operands. 
Also, we may need to handle more cases than GT_IND, especially once + // we've modified the register allocator to not require all nodes to be assigned + // a register (e.g. a spilled lclVar can often be referenced directly from memory). + // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op. + + GenTree* delayUseSrc = nullptr; + // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have + // to special case them. + if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD) + { + // These tree nodes will have their op1 marked as isDelayFree=true. + // Hence these tree nodes should have a Def position so that op1's reg + // gets freed at DefLoc+1. + if (tree->TypeGet() == TYP_VOID) + { + // Right now a GT_XADD node could be morphed into a + // GT_LOCKADD of TYP_VOID. See gtExtractSideEffList(). + // Note that it is advantageous to use GT_LOCKADD + // instead of GT_XADD as the former uses lock.add, + // which allows its second operand to be a contained + // immediate, whereas the xadd instruction requires its + // second operand to be in a register. + assert(tree->gtLsraInfo.dstCount == 0); + + // Give it an artificial type and mark it isLocalDefUse = true. + // This would result in a Def position created but not considered + // consumed by its parent node. + tree->gtType = TYP_INT; + tree->gtLsraInfo.isLocalDefUse = true; + } + else + { + assert(tree->gtLsraInfo.dstCount != 0); + } + + delayUseSrc = op1; + } + else if ((op2 != nullptr) && + (!tree->OperIsCommutative() || (op2->isMemoryOp() && (op2->gtLsraInfo.srcCount == 0)))) + { + delayUseSrc = op2; + } + if (delayUseSrc != nullptr) + { + // If delayUseSrc is an indirection and it doesn't produce a result, then we need to set "delayFree" + // on the base & index, if any. + // Otherwise, we set it on delayUseSrc itself. + if (delayUseSrc->isIndir() && (delayUseSrc->gtLsraInfo.dstCount == 0)) + { + GenTree* base = delayUseSrc->AsIndir()->Base(); + GenTree* index = delayUseSrc->AsIndir()->Index(); + if (base != nullptr) + { + base->gtLsraInfo.isDelayFree = true; + } + if (index != nullptr) + { + index->gtLsraInfo.isDelayFree = true; + } + } + else + { + delayUseSrc->gtLsraInfo.isDelayFree = true; + } + info->hasDelayFreeSrc = true; + } + } + } + + TreeNodeInfoInitCheckByteable(tree); + + // We need to be sure that we've set info->srcCount and info->dstCount appropriately + assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT)); +} + +//------------------------------------------------------------------------ +// TreeNodeInfoInitCheckByteable: Check the tree to see if "byte-able" registers are +// required, and set the tree node info accordingly. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// None. +// +void Lowering::TreeNodeInfoInitCheckByteable(GenTree* tree) +{ +#ifdef _TARGET_X86_ + LinearScan* l = m_lsra; + TreeNodeInfo* info = &(tree->gtLsraInfo); + + // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands + // if the tree node is a byte type. + // + // Though this looks conservative in theory, in practice we could not think of a case where + // the logic below leads to an overly conservative register specification. If we ever find + // such a case, this logic will need to be fine-tuned for it.
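+    // Background (illustrative): on x86 only EAX/EBX/ECX/EDX have byte-addressable forms
+    // (AL/BL/CL/DL), so a byte-typed value that feeds, say, a setcc or a byte-sized store must
+    // not land in ESI/EDI/EBP/ESP; masking out RBM_NON_BYTE_REGS below enforces that.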
+ + if (ExcludeNonByteableRegisters(tree)) + { + regMaskTP regMask; + if (info->dstCount > 0) + { + regMask = info->getDstCandidates(l); + assert(regMask != RBM_NONE); + info->setDstCandidates(l, regMask & ~RBM_NON_BYTE_REGS); + } + + if (tree->OperIsSimple() && (info->srcCount > 0)) + { + // No need to set src candidates on a contained child operand. + GenTree* op = tree->gtOp.gtOp1; + assert(op != nullptr); + bool containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0); + if (!containedNode) + { + regMask = op->gtLsraInfo.getSrcCandidates(l); + assert(regMask != RBM_NONE); + op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS); + } + + if (tree->OperIsBinary() && (tree->gtOp.gtOp2 != nullptr)) + { + op = tree->gtOp.gtOp2; + containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0); + if (!containedNode) + { + regMask = op->gtLsraInfo.getSrcCandidates(l); + assert(regMask != RBM_NONE); + op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS); + } + } + } + } +#endif //_TARGET_X86_ +} + +//------------------------------------------------------------------------ +// TreeNodeInfoInitSimple: Sets the srcCount and dstCount for all the trees +// without special handling based on the tree node type. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// None. +// +void Lowering::TreeNodeInfoInitSimple(GenTree* tree) +{ + TreeNodeInfo* info = &(tree->gtLsraInfo); + unsigned kind = tree->OperKind(); + info->dstCount = tree->IsValue() ? 1 : 0; + if (kind & (GTK_CONST | GTK_LEAF)) + { + info->srcCount = 0; + } + else if (kind & (GTK_SMPOP)) + { + if (tree->gtGetOp2IfPresent() != nullptr) + { + info->srcCount = 2; + } + else + { + info->srcCount = 1; + } + } + else + { + unreached(); + } +} + +//------------------------------------------------------------------------ +// TreeNodeInfoInitReturn: Set the NodeInfo for a GT_RETURN. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// None. +// +void Lowering::TreeNodeInfoInitReturn(GenTree* tree) +{ + TreeNodeInfo* info = &(tree->gtLsraInfo); + LinearScan* l = m_lsra; + Compiler* compiler = comp; + +#if !defined(_TARGET_64BIT_) + if (tree->TypeGet() == TYP_LONG) + { + GenTree* op1 = tree->gtGetOp1(); + noway_assert(op1->OperGet() == GT_LONG); + GenTree* loVal = op1->gtGetOp1(); + GenTree* hiVal = op1->gtGetOp2(); + info->srcCount = 2; + loVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_LO); + hiVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_HI); + info->dstCount = 0; + } + else +#endif // !defined(_TARGET_64BIT_) + { + GenTree* op1 = tree->gtGetOp1(); + regMaskTP useCandidates = RBM_NONE; + + info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1; + info->dstCount = 0; + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (varTypeIsStruct(tree)) + { + // op1 has to be either an lclvar or a multi-reg returning call + if (op1->OperGet() == GT_LCL_VAR) + { + GenTreeLclVarCommon* lclVarCommon = op1->AsLclVarCommon(); + LclVarDsc* varDsc = &(compiler->lvaTable[lclVarCommon->gtLclNum]); + assert(varDsc->lvIsMultiRegRet); + + // Mark var as contained if not enregistrable. 
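+            // Sketch of the shape handled here (SysV AMD64): a small struct is returned in up to
+            // two registers (e.g. RAX/RDX or XMM0/XMM1). A lclVar that cannot itself be
+            // enregistered is contained, and codegen loads the pieces from its stack home into
+            // the return registers.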
+ if (!varTypeIsEnregisterableStruct(op1)) + { + MakeSrcContained(tree, op1); + } + } + else + { + noway_assert(op1->IsMultiRegCall()); + + ReturnTypeDesc* retTypeDesc = op1->AsCall()->GetReturnTypeDesc(); + info->srcCount = retTypeDesc->GetReturnRegCount(); + useCandidates = retTypeDesc->GetABIReturnRegs(); + } + } + else +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + { + // Non-struct type return - determine useCandidates + switch (tree->TypeGet()) + { + case TYP_VOID: + useCandidates = RBM_NONE; + break; + case TYP_FLOAT: + useCandidates = RBM_FLOATRET; + break; + case TYP_DOUBLE: + useCandidates = RBM_DOUBLERET; + break; +#if defined(_TARGET_64BIT_) + case TYP_LONG: + useCandidates = RBM_LNGRET; + break; +#endif // defined(_TARGET_64BIT_) + default: + useCandidates = RBM_INTRET; + break; + } + } + + if (useCandidates != RBM_NONE) + { + op1->gtLsraInfo.setSrcCandidates(l, useCandidates); + } + } +} + +//------------------------------------------------------------------------ +// TreeNodeInfoInitShiftRotate: Set the NodeInfo for a shift or rotate. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// None. +// +void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree) +{ + TreeNodeInfo* info = &(tree->gtLsraInfo); + LinearScan* l = m_lsra; + + info->srcCount = 2; + info->dstCount = 1; + + // For shift operations, we need that the number + // of bits moved gets stored in CL in case + // the number of bits to shift is not a constant. + GenTreePtr shiftBy = tree->gtOp.gtOp2; + GenTreePtr source = tree->gtOp.gtOp1; + +#ifdef _TARGET_X86_ + // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that + // we can have a three operand form. Increment the srcCount. + if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO) + { + assert(source->OperGet() == GT_LONG); + + info->srcCount++; + + if (tree->OperGet() == GT_LSH_HI) + { + GenTreePtr sourceLo = source->gtOp.gtOp1; + sourceLo->gtLsraInfo.isDelayFree = true; + } + else + { + GenTreePtr sourceHi = source->gtOp.gtOp2; + sourceHi->gtLsraInfo.isDelayFree = true; + } + + source->gtLsraInfo.hasDelayFreeSrc = true; + info->hasDelayFreeSrc = true; + } +#endif + + // x64 can encode 8 bits of shift and it will use 5 or 6. (the others are masked off) + // We will allow whatever can be encoded - hope you know what you are doing. + if (!IsContainableImmed(tree, shiftBy) || (shiftBy->gtIntConCommon.IconValue() > 255) || + (shiftBy->gtIntConCommon.IconValue() < 0)) + { + source->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX); + shiftBy->gtLsraInfo.setSrcCandidates(l, RBM_RCX); + info->setDstCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX); + } + else + { + MakeSrcContained(tree, shiftBy); + + // Note that Rotate Left/Right instructions don't set ZF and SF flags. + // + // If the operand being shifted is 32-bits then upper three bits are masked + // by hardware to get actual shift count. Similarly for 64-bit operands + // shift count is narrowed to [0..63]. If the resulting shift count is zero, + // then shift operation won't modify flags. + // + // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0) + // if the shift count is known to be non-zero and in the range depending on the + // operand size. + } +} + +//------------------------------------------------------------------------ +// TreeNodeInfoInitCall: Set the NodeInfo for a call. +// +// Arguments: +// call - The call node of interest +// +// Return Value: +// None. 
+// +void Lowering::TreeNodeInfoInitCall(GenTreeCall* call) +{ + TreeNodeInfo* info = &(call->gtLsraInfo); + LinearScan* l = m_lsra; + Compiler* compiler = comp; + bool hasMultiRegRetVal = false; + ReturnTypeDesc* retTypeDesc = nullptr; + + info->srcCount = 0; + if (call->TypeGet() != TYP_VOID) + { + hasMultiRegRetVal = call->HasMultiRegRetVal(); + if (hasMultiRegRetVal) + { + // dst count = number of registers in which the value is returned by call + retTypeDesc = call->GetReturnTypeDesc(); + info->dstCount = retTypeDesc->GetReturnRegCount(); + } + else + { + info->dstCount = 1; + } + } + else + { + info->dstCount = 0; + } + + GenTree* ctrlExpr = call->gtControlExpr; + if (call->gtCallType == CT_INDIRECT) + { + // either gtControlExpr != null or gtCallAddr != null. + // Both cannot be non-null at the same time. + assert(ctrlExpr == nullptr); + assert(call->gtCallAddr != nullptr); + ctrlExpr = call->gtCallAddr; + +#ifdef _TARGET_X86_ + // Fast tail calls aren't currently supported on x86, but if they ever are, the code + // below that handles indirect VSD calls will need to be fixed. + assert(!call->IsFastTailCall() || !call->IsVirtualStub()); +#endif // _TARGET_X86_ + } + + // set reg requirements on call target represented as control sequence. + if (ctrlExpr != nullptr) + { + // we should never see a gtControlExpr whose type is void. + assert(ctrlExpr->TypeGet() != TYP_VOID); + + // call can take a Rm op on x64 + info->srcCount++; + + // In case of fast tail implemented as jmp, make sure that gtControlExpr is + // computed into a register. + if (!call->IsFastTailCall()) + { +#ifdef _TARGET_X86_ + // On x86, we need to generate a very specific pattern for indirect VSD calls: + // + // 3-byte nop + // call dword ptr [eax] + // + // Where EAX is also used as an argument to the stub dispatch helper. Make + // sure that the call target address is computed into EAX in this case. + if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT)) + { + assert(ctrlExpr->isIndir()); + + ctrlExpr->gtGetOp1()->gtLsraInfo.setSrcCandidates(l, RBM_VIRTUAL_STUB_TARGET); + MakeSrcContained(call, ctrlExpr); + } + else +#endif // _TARGET_X86_ + if (ctrlExpr->isIndir()) + { + MakeSrcContained(call, ctrlExpr); + } + } + else + { + // Fast tail call - make sure that call target is always computed in RAX + // so that epilog sequence can generate "jmp rax" to achieve fast tail call. + ctrlExpr->gtLsraInfo.setSrcCandidates(l, RBM_RAX); + } + } + + // If this is a varargs call, we will clear the internal candidates in case we need + // to reserve some integer registers for copying float args. + // We have to do this because otherwise the default candidates are allRegs, and adding + // the individual specific registers will have no effect. + if (call->IsVarargs()) + { + info->setInternalCandidates(l, RBM_NONE); + } + + RegisterType registerType = call->TypeGet(); + + // Set destination candidates for return value of the call. + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef _TARGET_X86_ + if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME)) + { + // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with + // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the + // correct argument registers. 
+ info->setDstCandidates(l, RBM_PINVOKE_TCB); + } + else +#endif // _TARGET_X86_ + if (hasMultiRegRetVal) + { + assert(retTypeDesc != nullptr); + info->setDstCandidates(l, retTypeDesc->GetABIReturnRegs()); + } + else if (varTypeIsFloating(registerType)) + { +#ifdef _TARGET_X86_ + // The return value will be on the X87 stack, and we will need to move it. + info->setDstCandidates(l, l->allRegs(registerType)); +#else // !_TARGET_X86_ + info->setDstCandidates(l, RBM_FLOATRET); +#endif // !_TARGET_X86_ + } + else if (registerType == TYP_LONG) + { + info->setDstCandidates(l, RBM_LNGRET); + } + else + { + info->setDstCandidates(l, RBM_INTRET); + } + + // number of args to a call = + // callRegArgs + (callargs - placeholders, setup, etc) + // there is an explicit thisPtr but it is redundant + + // If there is an explicit this pointer, we don't want that node to produce anything + // as it is redundant + if (call->gtCallObjp != nullptr) + { + GenTreePtr thisPtrNode = call->gtCallObjp; + + if (thisPtrNode->gtOper == GT_PUTARG_REG) + { + l->clearOperandCounts(thisPtrNode); + l->clearDstCount(thisPtrNode->gtOp.gtOp1); + } + else + { + l->clearDstCount(thisPtrNode); + } + } + +#if FEATURE_VARARG + bool callHasFloatRegArgs = false; +#endif // !FEATURE_VARARG + + // First, count reg args + for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext()) + { + assert(list->OperIsList()); + + GenTreePtr argNode = list->Current(); + + fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode); + assert(curArgTabEntry); + + if (curArgTabEntry->regNum == REG_STK) + { + // late arg that is not passed in a register + DISPNODE(argNode); + assert(argNode->gtOper == GT_PUTARG_STK); + argNode->gtLsraInfo.srcCount = 1; + argNode->gtLsraInfo.dstCount = 0; + +#ifdef FEATURE_PUT_STRUCT_ARG_STK + // If the node is TYP_STRUCT and it is put on stack with + // putarg_stk operation, we consume and produce no registers. + // In this case the embedded Obj node should not produce + // registers too since it is contained. + // Note that if it is a SIMD type the argument will be in a register. + if (argNode->TypeGet() == TYP_STRUCT) + { + assert(argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_OBJ); + argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0; + argNode->gtLsraInfo.srcCount = 0; + } +#endif // FEATURE_PUT_STRUCT_ARG_STK + continue; + } + + regNumber argReg = REG_NA; + regMaskTP argMask = RBM_NONE; + short regCount = 0; + bool isOnStack = true; + if (curArgTabEntry->regNum != REG_STK) + { + isOnStack = false; + var_types argType = argNode->TypeGet(); + +#if FEATURE_VARARG + callHasFloatRegArgs |= varTypeIsFloating(argType); +#endif // !FEATURE_VARARG + + argReg = curArgTabEntry->regNum; + regCount = 1; + + // Default case is that we consume one source; modify this later (e.g. for + // promoted structs) + info->srcCount++; + + argMask = genRegMask(argReg); + argNode = argNode->gtEffectiveVal(); + } + + // If the struct arg is wrapped in CPYBLK the type of the param will be TYP_VOID. + // Use the curArgTabEntry's isStruct to get whether the param is a struct. 
+ if (varTypeIsStruct(argNode) PUT_STRUCT_ARG_STK_ONLY(|| curArgTabEntry->isStruct)) + { + unsigned originalSize = 0; + LclVarDsc* varDsc = nullptr; + if (argNode->gtOper == GT_LCL_VAR) + { + varDsc = compiler->lvaTable + argNode->gtLclVarCommon.gtLclNum; + originalSize = varDsc->lvSize(); + } + else if (argNode->gtOper == GT_MKREFANY) + { + originalSize = 2 * TARGET_POINTER_SIZE; + } + else if (argNode->gtOper == GT_OBJ) + { + noway_assert(!"GT_OBJ not supported for amd64"); + } +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + else if (argNode->gtOper == GT_PUTARG_REG) + { + originalSize = genTypeSize(argNode->gtType); + } + else if (argNode->gtOper == GT_FIELD_LIST) + { + originalSize = 0; + + // There could be up to 2 PUTARG_REGs in the list + GenTreeFieldList* fieldListPtr = argNode->AsFieldList(); + unsigned iterationNum = 0; + for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest()) + { + GenTreePtr putArgRegNode = fieldListPtr->Current(); + assert(putArgRegNode->gtOper == GT_PUTARG_REG); + + if (iterationNum == 0) + { + varDsc = compiler->lvaTable + putArgRegNode->gtOp.gtOp1->gtLclVarCommon.gtLclNum; + originalSize = varDsc->lvSize(); + assert(originalSize != 0); + } + else + { + // Need an extra source for every node, but the first in the list. + info->srcCount++; + + // Get the mask for the second putarg_reg + argMask = genRegMask(curArgTabEntry->otherRegNum); + } + + putArgRegNode->gtLsraInfo.setDstCandidates(l, argMask); + putArgRegNode->gtLsraInfo.setSrcCandidates(l, argMask); + + // To avoid redundant moves, have the argument child tree computed in the + // register in which the argument is passed to the call. + putArgRegNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(putArgRegNode)); + iterationNum++; + } + + assert(iterationNum <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS); + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + else + { + noway_assert(!"Can't predict unsupported TYP_STRUCT arg kind"); + } + + unsigned slots = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES; + unsigned remainingSlots = slots; + + if (!isOnStack) + { + remainingSlots = slots - 1; + + regNumber reg = (regNumber)(argReg + 1); + while (remainingSlots > 0 && reg <= REG_ARG_LAST) + { + argMask |= genRegMask(reg); + reg = (regNumber)(reg + 1); + remainingSlots--; + regCount++; + } + } + + short internalIntCount = 0; + if (remainingSlots > 0) + { +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // This TYP_STRUCT argument is also passed in the outgoing argument area + // We need a register to address the TYP_STRUCT + internalIntCount = 1; +#else // FEATURE_UNIX_AMD64_STRUCT_PASSING + // And we may need 2 + internalIntCount = 2; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + } + argNode->gtLsraInfo.internalIntCount = internalIntCount; + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (argNode->gtOper == GT_PUTARG_REG) + { + argNode->gtLsraInfo.setDstCandidates(l, argMask); + argNode->gtLsraInfo.setSrcCandidates(l, argMask); + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + } + else + { + argNode->gtLsraInfo.setDstCandidates(l, argMask); + argNode->gtLsraInfo.setSrcCandidates(l, argMask); + } + + // To avoid redundant moves, have the argument child tree computed in the + // register in which the argument is passed to the call. 
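+        // For instance (illustrative): if this argument is passed in RDX, preferencing the
+        // PUTARG_REG source to RDX lets the value be produced directly in RDX, avoiding an extra
+        //     mov rdx, <someReg>
+        // immediately before the call.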
+ if (argNode->gtOper == GT_PUTARG_REG) + { + argNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(argNode)); + } + +#if FEATURE_VARARG + // In the case of a varargs call, the ABI dictates that if we have floating point args, + // we must pass the enregistered arguments in both the integer and floating point registers. + // Since the integer register is not associated with this arg node, we will reserve it as + // an internal register so that it is not used during the evaluation of the call node + // (e.g. for the target). + if (call->IsVarargs() && varTypeIsFloating(argNode)) + { + regNumber targetReg = compiler->getCallArgIntRegister(argReg); + info->setInternalIntCount(info->internalIntCount + 1); + info->addInternalCandidates(l, genRegMask(targetReg)); + } +#endif // FEATURE_VARARG + } + + // Now, count stack args + // Note that these need to be computed into a register, but then + // they're just stored to the stack - so the reg doesn't + // need to remain live until the call. In fact, it must not + // because the code generator doesn't actually consider it live, + // so it can't be spilled. + + GenTreePtr args = call->gtCallArgs; + while (args) + { + GenTreePtr arg = args->gtOp.gtOp1; + if (!(args->gtFlags & GTF_LATE_ARG)) + { + TreeNodeInfo* argInfo = &(arg->gtLsraInfo); + if (argInfo->dstCount != 0) + { + argInfo->isLocalDefUse = true; + } + + // If the child of GT_PUTARG_STK is a constant, we don't need a register to + // move it to memory (stack location). + // + // On AMD64, we don't want to make 0 contained, because we can generate smaller code + // by zeroing a register and then storing it. E.g.: + // xor rdx, rdx + // mov gword ptr [rsp+28H], rdx + // is 2 bytes smaller than: + // mov gword ptr [rsp+28H], 0 + // + // On x86, we push stack arguments; we don't use 'mov'. So: + // push 0 + // is 1 byte smaller than: + // xor rdx, rdx + // push rdx + + argInfo->dstCount = 0; + if (arg->gtOper == GT_PUTARG_STK) + { + GenTree* op1 = arg->gtOp.gtOp1; + if (IsContainableImmed(arg, op1) +#if defined(_TARGET_AMD64_) + && !op1->IsIntegralConst(0) +#endif // _TARGET_AMD64_ + ) + { + MakeSrcContained(arg, op1); + } + } + } + args = args->gtOp.gtOp2; + } + +#if FEATURE_VARARG + // If it is a fast tail call, it is already preferenced to use RAX. + // Therefore, no need set src candidates on call tgt again. + if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExpr != nullptr)) + { + // Don't assign the call target to any of the argument registers because + // we will use them to also pass floating point arguments as required + // by Amd64 ABI. + ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS)); + } +#endif // !FEATURE_VARARG +} + +//------------------------------------------------------------------------ +// TreeNodeInfoInitBlockStore: Set the NodeInfo for a block store. +// +// Arguments: +// blkNode - The block store node of interest +// +// Return Value: +// None. +// +void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) +{ + GenTree* dstAddr = blkNode->Addr(); + unsigned size = blkNode->gtBlkSize; + GenTree* source = blkNode->Data(); + LinearScan* l = m_lsra; + Compiler* compiler = comp; + + // Sources are dest address, initVal or source. + // We may require an additional source or temp register for the size. 
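+    // Rough shapes handled below (illustrative):
+    //     GT_STORE_BLK(dstAddr, initVal)            - init block (initVal possibly wrapped in GT_INIT_VAL)
+    //     GT_STORE_BLK(dstAddr, GT_IND(srcAddr))    - copy block
+    //     GT_STORE_DYN_BLK(dstAddr, data, size)     - the size is a third, dynamic source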
+ blkNode->gtLsraInfo.srcCount = 2; + blkNode->gtLsraInfo.dstCount = 0; + blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE); + GenTreePtr srcAddrOrFill = nullptr; + bool isInitBlk = blkNode->OperIsInitBlkOp(); + + regMaskTP dstAddrRegMask = RBM_NONE; + regMaskTP sourceRegMask = RBM_NONE; + regMaskTP blkSizeRegMask = RBM_NONE; + + if (isInitBlk) + { + GenTree* initVal = source; + if (initVal->OperIsInitVal()) + { + initVal = initVal->gtGetOp1(); + } + srcAddrOrFill = initVal; + + switch (blkNode->gtBlkOpKind) + { + case GenTreeBlk::BlkOpKindUnroll: + assert(initVal->IsCnsIntOrI()); + if (size >= XMM_REGSIZE_BYTES) + { + // Reserve an XMM register to fill it with + // a pack of 16 init value constants. + ssize_t fill = initVal->gtIntCon.gtIconVal & 0xFF; + blkNode->gtLsraInfo.internalFloatCount = 1; + blkNode->gtLsraInfo.setInternalCandidates(l, l->internalFloatRegCandidates()); + if ((fill == 0) && ((size & 0xf) == 0)) + { + MakeSrcContained(blkNode, source); + } + // use XMM register to fill with constants, it's AVX instruction and set the flag + SetContainsAVXFlags(); + } +#ifdef _TARGET_X86_ + if ((size & 1) != 0) + { + // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing + // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this + // when unrolling, so only allow byteable registers as the source value. (We could + // consider just using BlkOpKindRepInstr instead.) + sourceRegMask = RBM_BYTE_REGS; + } +#endif // _TARGET_X86_ + break; + + case GenTreeBlk::BlkOpKindRepInstr: + // rep stos has the following register requirements: + // a) The memory address to be in RDI. + // b) The fill value has to be in RAX. + // c) The buffer size will go in RCX. + dstAddrRegMask = RBM_RDI; + srcAddrOrFill = initVal; + sourceRegMask = RBM_RAX; + blkSizeRegMask = RBM_RCX; + break; + + case GenTreeBlk::BlkOpKindHelper: +#ifdef _TARGET_AMD64_ + // The helper follows the regular AMD64 ABI. + dstAddrRegMask = RBM_ARG_0; + sourceRegMask = RBM_ARG_1; + blkSizeRegMask = RBM_ARG_2; +#else // !_TARGET_AMD64_ + dstAddrRegMask = RBM_RDI; + sourceRegMask = RBM_RAX; + blkSizeRegMask = RBM_RCX; +#endif // !_TARGET_AMD64_ + break; + + default: + unreached(); + } + } + else + { + // CopyObj or CopyBlk + if (source->gtOper == GT_IND) + { + srcAddrOrFill = blkNode->Data()->gtGetOp1(); + // We're effectively setting source as contained, but can't call MakeSrcContained, because the + // "inheritance" of the srcCount is to a child not a parent - it would "just work" but could be misleading. + // If srcAddr is already non-contained, we don't need to change it. + if (srcAddrOrFill->gtLsraInfo.getDstCount() == 0) + { + srcAddrOrFill->gtLsraInfo.setDstCount(1); + srcAddrOrFill->gtLsraInfo.setSrcCount(source->gtLsraInfo.srcCount); + } + m_lsra->clearOperandCounts(source); + } + else if (!source->IsMultiRegCall() && !source->OperIsSIMD()) + { + assert(source->IsLocal()); + MakeSrcContained(blkNode, source); + } + if (blkNode->OperGet() == GT_STORE_OBJ) + { + if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr) + { + // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq. + blkSizeRegMask = RBM_RCX; + } + // The srcAddr must be in a register. If it was under a GT_IND, we need to subsume all of its + // sources. + sourceRegMask = RBM_RSI; + dstAddrRegMask = RBM_RDI; + } + else + { + switch (blkNode->gtBlkOpKind) + { + case GenTreeBlk::BlkOpKindUnroll: + // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg. 
+ // + // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte. + // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude + // RBM_NON_BYTE_REGS from internal candidates. + if ((size & (XMM_REGSIZE_BYTES - 1)) != 0) + { + blkNode->gtLsraInfo.internalIntCount++; + regMaskTP regMask = l->allRegs(TYP_INT); + +#ifdef _TARGET_X86_ + if ((size & 1) != 0) + { + regMask &= ~RBM_NON_BYTE_REGS; + } +#endif + blkNode->gtLsraInfo.setInternalCandidates(l, regMask); + } + + if (size >= XMM_REGSIZE_BYTES) + { + // If we have a buffer larger than XMM_REGSIZE_BYTES, + // reserve an XMM register to use it for a + // series of 16-byte loads and stores. + blkNode->gtLsraInfo.internalFloatCount = 1; + blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates()); + // Uses XMM reg for load and store and hence check to see whether AVX instructions + // are used for codegen, set ContainsAVX flag + SetContainsAVXFlags(); + } + // If src or dst are on stack, we don't have to generate the address + // into a register because it's just some constant+SP. + if ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr()) + { + MakeSrcContained(blkNode, srcAddrOrFill); + } + + if (dstAddr->OperIsLocalAddr()) + { + MakeSrcContained(blkNode, dstAddr); + } + + break; + + case GenTreeBlk::BlkOpKindRepInstr: + // rep stos has the following register requirements: + // a) The dest address has to be in RDI. + // b) The src address has to be in RSI. + // c) The buffer size will go in RCX. + dstAddrRegMask = RBM_RDI; + sourceRegMask = RBM_RSI; + blkSizeRegMask = RBM_RCX; + break; + + case GenTreeBlk::BlkOpKindHelper: +#ifdef _TARGET_AMD64_ + // The helper follows the regular AMD64 ABI. + dstAddrRegMask = RBM_ARG_0; + sourceRegMask = RBM_ARG_1; + blkSizeRegMask = RBM_ARG_2; +#else // !_TARGET_AMD64_ + dstAddrRegMask = RBM_RDI; + sourceRegMask = RBM_RAX; + blkSizeRegMask = RBM_RCX; +#endif // !_TARGET_AMD64_ + break; + + default: + unreached(); + } + } + } + + if (dstAddrRegMask != RBM_NONE) + { + dstAddr->gtLsraInfo.setSrcCandidates(l, dstAddrRegMask); + } + if (sourceRegMask != RBM_NONE) + { + if (srcAddrOrFill != nullptr) + { + srcAddrOrFill->gtLsraInfo.setSrcCandidates(l, sourceRegMask); + } + else + { + // This is a local source; we'll use a temp register for its address. + blkNode->gtLsraInfo.addInternalCandidates(l, sourceRegMask); + blkNode->gtLsraInfo.internalIntCount++; + } + } + if (blkSizeRegMask != RBM_NONE) + { + if (size != 0) + { + // Reserve a temp register for the block size argument. + blkNode->gtLsraInfo.addInternalCandidates(l, blkSizeRegMask); + blkNode->gtLsraInfo.internalIntCount++; + } + else + { + // The block size argument is a third argument to GT_STORE_DYN_BLK + noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK); + blkNode->gtLsraInfo.setSrcCount(3); + GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize; + blockSize->gtLsraInfo.setSrcCandidates(l, blkSizeRegMask); + } + } +} + +#ifdef FEATURE_PUT_STRUCT_ARG_STK +//------------------------------------------------------------------------ +// TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// None. 
+// +void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk) +{ + TreeNodeInfo* info = &(putArgStk->gtLsraInfo); + LinearScan* l = m_lsra; + info->srcCount = 0; + +#ifdef _TARGET_X86_ + if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST) + { + unsigned fieldCount = 0; + bool needsByteTemp = false; + bool needsSimdTemp = false; + unsigned prevOffset = putArgStk->getArgSize(); + for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest()) + { + GenTree* const fieldNode = current->Current(); + const var_types fieldType = fieldNode->TypeGet(); + const unsigned fieldOffset = current->gtFieldOffset; + assert(fieldType != TYP_LONG); + info->srcCount++; + + // For x86 we must mark all integral fields as contained or reg-optional, and handle them + // accordingly in code generation, since we may have up to 8 fields, which cannot all be in + // registers to be consumed atomically by the call. + if (varTypeIsIntegralOrI(fieldNode)) + { + if (fieldNode->OperGet() == GT_LCL_VAR) + { + LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]); + if (varDsc->lvTracked && !varDsc->lvDoNotEnregister) + { + SetRegOptional(fieldNode); + } + else + { + MakeSrcContained(putArgStk, fieldNode); + } + } + else if (fieldNode->IsIntCnsFitsInI32()) + { + MakeSrcContained(putArgStk, fieldNode); + } + else + { + // For the case where we cannot directly push the value, if we run out of registers, + // it would be better to defer computation until we are pushing the arguments rather + // than spilling, but this situation is not all that common, as most cases of promoted + // structs do not have a large number of fields, and of those most are lclVars or + // copy-propagated constants. + SetRegOptional(fieldNode); + } + } +#if defined(FEATURE_SIMD) + // Note that we need to check the GT_FIELD_LIST type, not the fieldType. This is because the + // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where + // we "round up" to 16. + else if (current->gtFieldType == TYP_SIMD12) + { + needsSimdTemp = true; + } +#endif // defined(FEATURE_SIMD) + else + { + assert(varTypeIsFloating(fieldNode) || varTypeIsSIMD(fieldNode)); + } + + // We can treat as a slot any field that is stored at a slot boundary, where the previous + // field is not in the same slot. (Note that we store the fields in reverse order.) + const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4); + if (!fieldIsSlot) + { + if (varTypeIsByte(fieldType)) + { + // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes + // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will + // need a byte-addressable register for the store. We will enforce this requirement on an internal + // register, which we can use to copy multiple byte values. + needsByteTemp = true; + } + } + + if (varTypeIsGC(fieldType)) + { + putArgStk->gtNumberReferenceSlots++; + } + prevOffset = fieldOffset; + fieldCount++; + } + + info->dstCount = 0; + + if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push) + { + // If any of the fields cannot be stored with an actual push, we may need a temporary + // register to load the value before storing it to the stack location. 
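+            // Illustrative sequence for such a field (registers and offsets arbitrary): load it
+            // into the temp, then store it into the outgoing area, e.g.
+            //     mov  eax, dword ptr [ebp+12]
+            //     mov  dword ptr [esp+4], eax
+            // A byte-sized field needs AL/BL/CL/DL for the store, which is what needsByteTemp
+            // tracks above.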
+ info->internalIntCount = 1; + regMaskTP regMask = l->allRegs(TYP_INT); + if (needsByteTemp) + { + regMask &= ~RBM_NON_BYTE_REGS; + } + info->setInternalCandidates(l, regMask); + } + +#if defined(FEATURE_SIMD) + // For PutArgStk of a TYP_SIMD12, we need a SIMD temp register. + if (needsSimdTemp) + { + info->internalFloatCount += 1; + info->addInternalCandidates(l, l->allSIMDRegs()); + } +#endif // defined(FEATURE_SIMD) + + return; + } +#endif // _TARGET_X86_ + +#if defined(FEATURE_SIMD) && defined(_TARGET_X86_) + // For PutArgStk of a TYP_SIMD12, we need an extra register. + if (putArgStk->TypeGet() == TYP_SIMD12) + { + info->srcCount = putArgStk->gtOp1->gtLsraInfo.dstCount; + info->dstCount = 0; + info->internalFloatCount = 1; + info->setInternalCandidates(l, l->allSIMDRegs()); + return; + } +#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_) + + if (putArgStk->TypeGet() != TYP_STRUCT) + { + TreeNodeInfoInitSimple(putArgStk); + return; + } + + GenTreePtr dst = putArgStk; + GenTreePtr src = putArgStk->gtOp1; + GenTreePtr srcAddr = nullptr; + + bool haveLocalAddr = false; + if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND)) + { + srcAddr = src->gtOp.gtOp1; + assert(srcAddr != nullptr); + haveLocalAddr = srcAddr->OperIsLocalAddr(); + } + else + { + assert(varTypeIsSIMD(putArgStk)); + } + + info->srcCount = src->gtLsraInfo.dstCount; + info->dstCount = 0; + + // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2. + // Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of + // our framework assemblies, so this is the main code generation scheme we'll use. + ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE; + switch (putArgStk->gtPutArgStkKind) + { + case GenTreePutArgStk::Kind::Push: + case GenTreePutArgStk::Kind::PushAllSlots: + case GenTreePutArgStk::Kind::Unroll: + // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg. + // + // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte. + // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude + // RBM_NON_BYTE_REGS from internal candidates. + if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0) + { + info->internalIntCount++; + regMaskTP regMask = l->allRegs(TYP_INT); + +#ifdef _TARGET_X86_ + if ((size % 2) != 0) + { + regMask &= ~RBM_NON_BYTE_REGS; + } +#endif + info->setInternalCandidates(l, regMask); + } + +#ifdef _TARGET_X86_ + if (size >= 8) +#else // !_TARGET_X86_ + if (size >= XMM_REGSIZE_BYTES) +#endif // !_TARGET_X86_ + { + // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux, + // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a + // series of 16-byte loads and stores. + info->internalFloatCount = 1; + info->addInternalCandidates(l, l->internalFloatRegCandidates()); + SetContainsAVXFlags(); + } + break; + + case GenTreePutArgStk::Kind::RepInstr: + info->internalIntCount += 3; + info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI)); + break; + + default: + unreached(); + } + + // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree. + MakeSrcContained(putArgStk, src); + + if (haveLocalAddr) + { + // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary + // copies. 
+ //
+ // To avoid an assertion in MakeSrcContained, increment the parent's source count beforehand and decrement it
+ // afterwards.
+ info->srcCount++;
+ MakeSrcContained(putArgStk, srcAddr);
+ info->srcCount--;
+ }
+}
+#endif // FEATURE_PUT_STRUCT_ARG_STK
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitLclHeap: Set the NodeInfo for a GT_LCLHEAP.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
+{
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+ LinearScan* l = m_lsra;
+ Compiler* compiler = comp;
+
+ info->srcCount = 1;
+ info->dstCount = 1;
+
+ // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
+ // Here '-' means don't care.
+ //
+ // Size?                     Init Memory?  # temp regs
+ //  0                        -             0 (returns 0)
+ //  const and <=6 reg words  -             0 (pushes '0')
+ //  const and >6 reg words   Yes           0 (pushes '0')
+ //  const and <PageSize      No            0 (amd64) 1 (x86)
+ //                                           (x86: tmpReg for subtracting from esp)
+ //  const and >=PageSize     No            2 (regCnt and tmpReg for subtracting from sp)
+ //  Non-const                Yes           0 (regCnt=targetReg and pushes '0')
+ //  Non-const                No            2 (regCnt and tmpReg for subtracting from sp)
+ //
+ // Note: Here we don't need internal register to be different from targetReg.
+ // Rather, require it to be different from operand's reg.
+
+ GenTreePtr size = tree->gtOp.gtOp1;
+ if (size->IsCnsIntOrI())
+ {
+ MakeSrcContained(tree, size);
+
+ size_t sizeVal = size->gtIntCon.gtIconVal;
+
+ if (sizeVal == 0)
+ {
+ info->internalIntCount = 0;
+ }
+ else
+ {
+ // Compute the amount of memory to properly STACK_ALIGN.
+ // Note: The GenTree node is not updated here as it is cheap to recompute stack aligned size.
+ // This should also help in debugging as we can examine the original size specified with localloc.
+ sizeVal = AlignUp(sizeVal, STACK_ALIGN);
+
+ // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
+ // we will generate 'push 0'.
+ assert((sizeVal % REGSIZE_BYTES) == 0);
+ size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
+ if (cntRegSizedWords <= 6)
+ {
+ info->internalIntCount = 0;
+ }
+ else if (!compiler->info.compInitMem)
+ {
+ // No need to initialize allocated stack space.
+ if (sizeVal < compiler->eeGetPageSize())
+ {
+#ifdef _TARGET_X86_
+ info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP.
+#else // !_TARGET_X86_
+ info->internalIntCount = 0;
+#endif // !_TARGET_X86_
+ }
+ else
+ {
+ // We need two registers: regCnt and tmpReg
+ info->internalIntCount = 2;
+ }
+ }
+ else
+ {
+ // >6 and need to zero initialize allocated stack space.
+ info->internalIntCount = 0;
+ }
+ }
+ }
+ else
+ {
+ if (!compiler->info.compInitMem)
+ {
+ info->internalIntCount = 2;
+ }
+ else
+ {
+ info->internalIntCount = 0;
+ }
+ }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitLogicalOp: Set the NodeInfo for GT_AND/GT_OR/GT_XOR,
+// as well as GT_ADD/GT_SUB.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitLogicalOp(GenTree* tree)
+{
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+ LinearScan* l = m_lsra;
+
+ // We're not marking a constant hanging on the left of the add
+ // as containable so we assign it to a register having CQ impact.
+ // TODO-XArch-CQ: Detect this case and support both generating a single instruction + // for GT_ADD(Constant, SomeTree) + info->srcCount = 2; + info->dstCount = 1; + + GenTree* op1 = tree->gtGetOp1(); + GenTree* op2 = tree->gtGetOp2(); + + // We can directly encode the second operand if it is either a containable constant or a memory-op. + // In case of memory-op, we can encode it directly provided its type matches with 'tree' type. + // This is because during codegen, type of 'tree' is used to determine emit Type size. If the types + // do not match, they get normalized (i.e. sign/zero extended) on load into a register. + bool directlyEncodable = false; + bool binOpInRMW = false; + GenTreePtr operand = nullptr; + + if (IsContainableImmed(tree, op2)) + { + directlyEncodable = true; + operand = op2; + } + else + { + binOpInRMW = IsBinOpInRMWStoreInd(tree); + if (!binOpInRMW) + { + if (op2->isMemoryOp() && tree->TypeGet() == op2->TypeGet()) + { + directlyEncodable = true; + operand = op2; + } + else if (tree->OperIsCommutative()) + { + if (IsContainableImmed(tree, op1) || + (op1->isMemoryOp() && tree->TypeGet() == op1->TypeGet() && IsSafeToContainMem(tree, op1))) + { + // If it is safe, we can reverse the order of operands of commutative operations for efficient + // codegen + directlyEncodable = true; + operand = op1; + } + } + } + } + + if (directlyEncodable) + { + assert(operand != nullptr); + MakeSrcContained(tree, operand); + } + else if (!binOpInRMW) + { + // If this binary op neither has contained operands, nor is a + // Read-Modify-Write (RMW) operation, we can mark its operands + // as reg optional. + SetRegOptionalForBinOp(tree); + } + + // Codegen of this tree node sets ZF and SF flags. + tree->gtFlags |= GTF_ZSF_SET; +} + +//------------------------------------------------------------------------ +// TreeNodeInfoInitModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// None. +// +void Lowering::TreeNodeInfoInitModDiv(GenTree* tree) +{ + TreeNodeInfo* info = &(tree->gtLsraInfo); + LinearScan* l = m_lsra; + + GenTree* op1 = tree->gtGetOp1(); + GenTree* op2 = tree->gtGetOp2(); + + info->srcCount = 2; + info->dstCount = 1; + + switch (tree->OperGet()) + { + case GT_MOD: + case GT_DIV: + if (varTypeIsFloating(tree->TypeGet())) + { + // No implicit conversions at this stage as the expectation is that + // everything is made explicit by adding casts. + assert(op1->TypeGet() == op2->TypeGet()); + + if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl()) + { + MakeSrcContained(tree, op2); + } + else + { + // If there are no containable operands, we can make an operand reg optional. + // SSE2 allows only op2 to be a memory-op. + SetRegOptional(op2); + } + + return; + } + break; + + default: + break; + } + + // Amd64 Div/Idiv instruction: + // Dividend in RAX:RDX and computes + // Quotient in RAX, Remainder in RDX + + if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD) + { + // We are interested in just the remainder. + // RAX is used as a trashable register during computation of remainder. + info->setDstCandidates(l, RBM_RDX); + } + else + { + // We are interested in just the quotient. + // RDX gets used as trashable register during computation of quotient + info->setDstCandidates(l, RBM_RAX); + } + + bool op2CanBeRegOptional = true; +#ifdef _TARGET_X86_ + if (op1->OperGet() == GT_LONG) + { + // To avoid reg move would like to have op1's low part in RAX and high part in RDX. 
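+ // For illustration (hypothetical sequence): the 32-bit div instruction consumes the dividend
+ // in EDX:EAX, e.g.
+ //     mov eax, <loVal> ; low half of the dividend
+ //     mov edx, <hiVal> ; high half of the dividend
+ //     div <divisor>    ; quotient -> EAX, remainder -> EDX
+ // which is why loVal/hiVal below are constrained to EAX/EDX respectively.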
+ GenTree* loVal = op1->gtGetOp1(); + GenTree* hiVal = op1->gtGetOp2(); + + // Src count is actually 3, so increment. + assert(op2->IsCnsIntOrI()); + assert(tree->OperGet() == GT_UMOD); + info->srcCount++; + op2CanBeRegOptional = false; + + // This situation also requires an internal register. + info->internalIntCount = 1; + info->setInternalCandidates(l, l->allRegs(TYP_INT)); + + loVal->gtLsraInfo.setSrcCandidates(l, RBM_EAX); + hiVal->gtLsraInfo.setSrcCandidates(l, RBM_EDX); + } + else +#endif + { + // If possible would like to have op1 in RAX to avoid a register move + op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX); + } + + // divisor can be an r/m, but the memory indirection must be of the same size as the divide + if (op2->isMemoryOp() && (op2->TypeGet() == tree->TypeGet())) + { + MakeSrcContained(tree, op2); + } + else if (op2CanBeRegOptional) + { + op2->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX)); + + // If there are no containable operands, we can make an operand reg optional. + // Div instruction allows only op2 to be a memory op. + SetRegOptional(op2); + } +} + +//------------------------------------------------------------------------ +// TreeNodeInfoInitIntrinsic: Set the NodeInfo for a GT_INTRINSIC. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// None. +// +void Lowering::TreeNodeInfoInitIntrinsic(GenTree* tree) +{ + TreeNodeInfo* info = &(tree->gtLsraInfo); + LinearScan* l = m_lsra; + + // Both operand and its result must be of floating point type. + GenTree* op1 = tree->gtGetOp1(); + assert(varTypeIsFloating(op1)); + assert(op1->TypeGet() == tree->TypeGet()); + + info->srcCount = 1; + info->dstCount = 1; + + switch (tree->gtIntrinsic.gtIntrinsicId) + { + case CORINFO_INTRINSIC_Sqrt: + if (op1->isMemoryOp() || op1->IsCnsNonZeroFltOrDbl()) + { + MakeSrcContained(tree, op1); + } + else + { + // Mark the operand as reg optional since codegen can still + // generate code if op1 is on stack. + SetRegOptional(op1); + } + break; + + case CORINFO_INTRINSIC_Abs: + // Abs(float x) = x & 0x7fffffff + // Abs(double x) = x & 0x7ffffff ffffffff + + // In case of Abs we need an internal register to hold mask. + + // TODO-XArch-CQ: avoid using an internal register for the mask. + // Andps or andpd both will operate on 128-bit operands. + // The data section constant to hold the mask is a 64-bit size. + // Therefore, we need both the operand and mask to be in + // xmm register. When we add support in emitter to emit 128-bit + // data constants and instructions that operate on 128-bit + // memory operands we can avoid the need for an internal register. + if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs) + { + info->internalFloatCount = 1; + info->setInternalCandidates(l, l->internalFloatRegCandidates()); + } + break; + +#ifdef _TARGET_X86_ + case CORINFO_INTRINSIC_Cos: + case CORINFO_INTRINSIC_Sin: + case CORINFO_INTRINSIC_Round: + NYI_X86("Math intrinsics Cos, Sin and Round"); + break; +#endif // _TARGET_X86_ + + default: + // Right now only Sqrt/Abs are treated as math intrinsics + noway_assert(!"Unsupported math intrinsic"); + unreached(); + break; + } +} + +#ifdef FEATURE_SIMD +//------------------------------------------------------------------------ +// TreeNodeInfoInitSIMD: Set the NodeInfo for a GT_SIMD tree. +// +// Arguments: +// tree - The GT_SIMD node of interest +// +// Return Value: +// None. 
+ +void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) +{ + GenTreeSIMD* simdTree = tree->AsSIMD(); + TreeNodeInfo* info = &(tree->gtLsraInfo); + LinearScan* lsra = m_lsra; + info->dstCount = 1; + SetContainsAVXFlags(true, simdTree->gtSIMDSize); + switch (simdTree->gtSIMDIntrinsicID) + { + GenTree* op1; + GenTree* op2; + + case SIMDIntrinsicInit: + { + op1 = tree->gtOp.gtOp1; + +#if !defined(_TARGET_64BIT_) + if (op1->OperGet() == GT_LONG) + { + info->srcCount = 2; + } + else +#endif // !defined(_TARGET_64BIT_) + { + info->srcCount = 1; + } + + // This sets all fields of a SIMD struct to the given value. + // Mark op1 as contained if it is either zero or int constant of all 1's, + // or a float constant with 16 or 32 byte simdType (AVX case) + // + // Should never see small int base type vectors except for zero initialization. + assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0)); + +#if !defined(_TARGET_64BIT_) + if (op1->OperGet() == GT_LONG) + { + GenTree* op1lo = op1->gtGetOp1(); + GenTree* op1hi = op1->gtGetOp2(); + + if ((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) || + (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))) + { + assert(op1->gtLsraInfo.srcCount == 0); + assert(op1->gtLsraInfo.dstCount == 0); + assert(op1lo->gtLsraInfo.srcCount == 0); + assert(op1lo->gtLsraInfo.dstCount == 1); + assert(op1hi->gtLsraInfo.srcCount == 0); + assert(op1hi->gtLsraInfo.dstCount == 1); + + op1lo->gtLsraInfo.dstCount = 0; + op1hi->gtLsraInfo.dstCount = 0; + info->srcCount = 0; + } + else + { + // need a temp + info->internalFloatCount = 1; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + info->isInternalRegDelayFree = true; + } + } + else +#endif // !defined(_TARGET_64BIT_) + if (op1->IsFPZero() || op1->IsIntegralConst(0) || + (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1))) + { + MakeSrcContained(tree, op1); + info->srcCount = 0; + } + else if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) && + ((simdTree->gtSIMDSize == 16) || (simdTree->gtSIMDSize == 32))) + { + // Either op1 is a float or dbl constant or an addr + if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr()) + { + MakeSrcContained(tree, op1); + info->srcCount = 0; + } + } + } + break; + + case SIMDIntrinsicInitN: + { + info->srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(simdTree->gtSIMDBaseType)); + + // Need an internal register to stitch together all the values into a single vector in a SIMD reg. + info->internalFloatCount = 1; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + } + break; + + case SIMDIntrinsicInitArray: + // We have an array and an index, which may be contained. + info->srcCount = 2; + CheckImmedAndMakeContained(tree, tree->gtGetOp2()); + break; + + case SIMDIntrinsicDiv: + // SSE2 has no instruction support for division on integer vectors + noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType)); + info->srcCount = 2; + break; + + case SIMDIntrinsicAbs: + // float/double vectors: This gets implemented as bitwise-And operation + // with a mask and hence should never see here. + // + // Must be a Vector<int> or Vector<short> Vector<sbyte> + assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT || + simdTree->gtSIMDBaseType == TYP_BYTE); + assert(comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4); + info->srcCount = 1; + break; + + case SIMDIntrinsicSqrt: + // SSE2 has no instruction support for sqrt on integer vectors. 
+ noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType)); + info->srcCount = 1; + break; + + case SIMDIntrinsicAdd: + case SIMDIntrinsicSub: + case SIMDIntrinsicMul: + case SIMDIntrinsicBitwiseAnd: + case SIMDIntrinsicBitwiseAndNot: + case SIMDIntrinsicBitwiseOr: + case SIMDIntrinsicBitwiseXor: + case SIMDIntrinsicMin: + case SIMDIntrinsicMax: + info->srcCount = 2; + + // SSE2 32-bit integer multiplication requires two temp regs + if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT && + comp->getSIMDInstructionSet() == InstructionSet_SSE2) + { + info->internalFloatCount = 2; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + } + break; + + case SIMDIntrinsicEqual: + info->srcCount = 2; + break; + + // SSE2 doesn't support < and <= directly on int vectors. + // Instead we need to use > and >= with swapped operands. + case SIMDIntrinsicLessThan: + case SIMDIntrinsicLessThanOrEqual: + info->srcCount = 2; + noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType)); + break; + + // SIMDIntrinsicEqual is supported only on non-floating point base type vectors. + // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors. + // Instead we need to use < and <= with swapped operands. + case SIMDIntrinsicGreaterThan: + noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType)); + info->srcCount = 2; + break; + + case SIMDIntrinsicOpEquality: + case SIMDIntrinsicOpInEquality: + info->srcCount = 2; + + // On SSE4/AVX, we can generate optimal code for (in)equality + // against zero using ptest. We can safely do this optimization + // for integral vectors but not for floating-point for the reason + // that we have +0.0 and -0.0 and +0.0 == -0.0 + op2 = tree->gtGetOp2(); + if ((comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0)) + { + MakeSrcContained(tree, op2); + } + else + { + // Need one SIMD register as scratch. + // See genSIMDIntrinsicRelOp() for details on code sequence generated and + // the need for one scratch register. + // + // Note these intrinsics produce a BOOL result, hence internal float + // registers reserved are guaranteed to be different from target + // integer register without explicitly specifying. + info->internalFloatCount = 1; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + } + break; + + case SIMDIntrinsicDotProduct: + // Float/Double vectors: + // For SSE, or AVX with 32-byte vectors, we also need an internal register + // as scratch. Further we need the targetReg and internal reg to be distinct + // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we + // don't need a tmpReg. + // + // 32-byte integer vector on SSE4/AVX: + // will take advantage of phaddd, which operates only on 128-bit xmm reg. + // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal + // registers since targetReg is an int type register. + // + // See genSIMDIntrinsicDotProduct() for details on code sequence generated + // and the need for scratch registers. + if (varTypeIsFloating(simdTree->gtSIMDBaseType)) + { + if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) || + (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32)) + { + info->internalFloatCount = 1; + info->isInternalRegDelayFree = true; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + } + // else don't need scratch reg(s). 
+ } + else + { + assert(simdTree->gtSIMDBaseType == TYP_INT && comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4); + + // No need to set isInternalRegDelayFree since targetReg is a + // an int type reg and guaranteed to be different from xmm/ymm + // regs. + info->internalFloatCount = comp->canUseAVX() ? 2 : 1; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + } + info->srcCount = 2; + break; + + case SIMDIntrinsicGetItem: + { + // This implements get_Item method. The sources are: + // - the source SIMD struct + // - index (which element to get) + // The result is baseType of SIMD struct. + info->srcCount = 2; + op1 = tree->gtOp.gtOp1; + op2 = tree->gtOp.gtOp2; + + // If the index is a constant, mark it as contained. + if (CheckImmedAndMakeContained(tree, op2)) + { + info->srcCount = 1; + } + + if (op1->isMemoryOp()) + { + MakeSrcContained(tree, op1); + + // Although GT_IND of TYP_SIMD12 reserves an internal float + // register for reading 4 and 8 bytes from memory and + // assembling them into target XMM reg, it is not required + // in this case. + op1->gtLsraInfo.internalIntCount = 0; + op1->gtLsraInfo.internalFloatCount = 0; + } + else + { + // If the index is not a constant, we will use the SIMD temp location to store the vector. + // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we + // can use that in the process of extracting the element. + // + // If the index is a constant and base type is a small int we can use pextrw, but on AVX + // we will need a temp if are indexing into the upper half of the AVX register. + // In all other cases with constant index, we need a temp xmm register to extract the + // element if index is other than zero. + + if (!op2->IsCnsIntOrI()) + { + (void)comp->getSIMDInitTempVarNum(); + } + else if (!varTypeIsFloating(simdTree->gtSIMDBaseType)) + { + bool needFloatTemp; + if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) && + (comp->getSIMDInstructionSet() == InstructionSet_AVX)) + { + int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType); + needFloatTemp = (byteShiftCnt >= 16); + } + else + { + needFloatTemp = !op2->IsIntegralConst(0); + } + + if (needFloatTemp) + { + info->internalFloatCount = 1; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + } + } + } + } + break; + + case SIMDIntrinsicSetX: + case SIMDIntrinsicSetY: + case SIMDIntrinsicSetZ: + case SIMDIntrinsicSetW: + info->srcCount = 2; + + // We need an internal integer register for SSE2 codegen + if (comp->getSIMDInstructionSet() == InstructionSet_SSE2) + { + info->internalIntCount = 1; + info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT)); + } + + break; + + case SIMDIntrinsicCast: + info->srcCount = 1; + break; + + case SIMDIntrinsicShuffleSSE2: + info->srcCount = 2; + // Second operand is an integer constant and marked as contained. + op2 = tree->gtOp.gtOp2; + noway_assert(op2->IsCnsIntOrI()); + MakeSrcContained(tree, op2); + break; + + case SIMDIntrinsicGetX: + case SIMDIntrinsicGetY: + case SIMDIntrinsicGetZ: + case SIMDIntrinsicGetW: + case SIMDIntrinsicGetOne: + case SIMDIntrinsicGetZero: + case SIMDIntrinsicGetCount: + case SIMDIntrinsicGetAllOnes: + assert(!"Get intrinsics should not be seen during Lowering."); + unreached(); + + default: + noway_assert(!"Unimplemented SIMD node type."); + unreached(); + } +} +#endif // FEATURE_SIMD + +//------------------------------------------------------------------------ +// TreeNodeInfoInitCast: Set the NodeInfo for a GT_CAST. 
+// +// Arguments: +// tree - The node of interest +// +// Return Value: +// None. +// +void Lowering::TreeNodeInfoInitCast(GenTree* tree) +{ + TreeNodeInfo* info = &(tree->gtLsraInfo); + + // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register. + // see CodeGen::genIntToIntCast() + + info->srcCount = 1; + info->dstCount = 1; + + // Non-overflow casts to/from float/double are done using SSE2 instructions + // and that allow the source operand to be either a reg or memop. Given the + // fact that casts from small int to float/double are done as two-level casts, + // the source operand is always guaranteed to be of size 4 or 8 bytes. + var_types castToType = tree->CastToType(); + GenTreePtr castOp = tree->gtCast.CastOp(); + var_types castOpType = castOp->TypeGet(); + if (tree->gtFlags & GTF_UNSIGNED) + { + castOpType = genUnsignedType(castOpType); + } + + if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType))) + { +#ifdef DEBUG + // If converting to float/double, the operand must be 4 or 8 byte in size. + if (varTypeIsFloating(castToType)) + { + unsigned opSize = genTypeSize(castOpType); + assert(opSize == 4 || opSize == 8); + } +#endif // DEBUG + + // U8 -> R8 conversion requires that the operand be in a register. + if (castOpType != TYP_ULONG) + { + if (castOp->isMemoryOp() || castOp->IsCnsNonZeroFltOrDbl()) + { + MakeSrcContained(tree, castOp); + } + else + { + // Mark castOp as reg optional to indicate codegen + // can still generate code if it is on stack. + SetRegOptional(castOp); + } + } + } + +#if !defined(_TARGET_64BIT_) + if (varTypeIsLong(castOpType)) + { + noway_assert(castOp->OperGet() == GT_LONG); + info->srcCount = 2; + } +#endif // !defined(_TARGET_64BIT_) + + // some overflow checks need a temp reg: + // - GT_CAST from INT64/UINT64 to UINT32 + if (tree->gtOverflow() && (castToType == TYP_UINT)) + { + if (genTypeSize(castOpType) == 8) + { + // Here we don't need internal register to be different from targetReg, + // rather require it to be different from operand's reg. + info->internalIntCount = 1; + } + } +} + +//------------------------------------------------------------------------ +// TreeNodeInfoInitGCWriteBarrier: Set the NodeInfo for a GT_STOREIND requiring a write barrier. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// None. +// +void Lowering::TreeNodeInfoInitGCWriteBarrier(GenTree* tree) +{ + assert(tree->OperGet() == GT_STOREIND); + + GenTreeStoreInd* dst = tree->AsStoreInd(); + GenTreePtr addr = dst->Addr(); + GenTreePtr src = dst->Data(); + + if (addr->OperGet() == GT_LEA) + { + // In the case where we are doing a helper assignment, if the dst + // is an indir through an lea, we need to actually instantiate the + // lea in a register + GenTreeAddrMode* lea = addr->AsAddrMode(); + + int leaSrcCount = 0; + if (lea->HasBase()) + { + leaSrcCount++; + } + if (lea->HasIndex()) + { + leaSrcCount++; + } + lea->gtLsraInfo.srcCount = leaSrcCount; + lea->gtLsraInfo.dstCount = 1; + } + + bool useOptimizedWriteBarrierHelper = false; // By default, assume no optimized write barriers. + +#if NOGC_WRITE_BARRIERS + +#if defined(_TARGET_X86_) + + useOptimizedWriteBarrierHelper = true; // On x86, use the optimized write barriers by default. +#ifdef DEBUG + GCInfo::WriteBarrierForm wbf = comp->codeGen->gcInfo.gcIsWriteBarrierCandidate(tree, src); + if (wbf == GCInfo::WBF_NoBarrier_CheckNotHeapInDebug) // This one is always a call to a C++ method. 
+ { + useOptimizedWriteBarrierHelper = false; + } +#endif + + if (useOptimizedWriteBarrierHelper) + { + // Special write barrier: + // op1 (addr) goes into REG_WRITE_BARRIER (rdx) and + // op2 (src) goes into any int register. + addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER); + src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER_SRC); + } + +#else // !defined(_TARGET_X86_) +#error "NOGC_WRITE_BARRIERS is not supported" +#endif // !defined(_TARGET_X86_) + +#endif // NOGC_WRITE_BARRIERS + + if (!useOptimizedWriteBarrierHelper) + { + // For the standard JIT Helper calls: + // op1 (addr) goes into REG_ARG_0 and + // op2 (src) goes into REG_ARG_1 + addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0); + src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1); + } + + // Both src and dst must reside in a register, which they should since we haven't set + // either of them as contained. + assert(addr->gtLsraInfo.dstCount == 1); + assert(src->gtLsraInfo.dstCount == 1); +} + +//----------------------------------------------------------------------------------------- +// TreeNodeInfoInitIndir: Specify register requirements for address expression of an indirection operation. +// +// Arguments: +// indirTree - GT_IND or GT_STOREIND gentree node +// +void Lowering::TreeNodeInfoInitIndir(GenTreePtr indirTree) +{ + assert(indirTree->isIndir()); + // If this is the rhs of a block copy (i.e. non-enregisterable struct), + // it has no register requirements. + if (indirTree->TypeGet() == TYP_STRUCT) + { + return; + } + + GenTreePtr addr = indirTree->gtGetOp1(); + TreeNodeInfo* info = &(indirTree->gtLsraInfo); + + GenTreePtr base = nullptr; + GenTreePtr index = nullptr; + unsigned mul, cns; + bool rev; + +#ifdef FEATURE_SIMD + // If indirTree is of TYP_SIMD12, don't mark addr as contained + // so that it always get computed to a register. This would + // mean codegen side logic doesn't need to handle all possible + // addr expressions that could be contained. + // + // TODO-XArch-CQ: handle other addr mode expressions that could be marked + // as contained. + if (indirTree->TypeGet() == TYP_SIMD12) + { + // Vector3 is read/written as two reads/writes: 8 byte and 4 byte. + // To assemble the vector properly we would need an additional + // XMM register. + info->internalFloatCount = 1; + + // In case of GT_IND we need an internal register different from targetReg and + // both of the registers are used at the same time. + if (indirTree->OperGet() == GT_IND) + { + info->isInternalRegDelayFree = true; + } + + info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs()); + + return; + } +#endif // FEATURE_SIMD + + if ((indirTree->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0) + { + // The address of an indirection that requires its address in a reg. + // Skip any further processing that might otherwise make it contained. + } + else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR)) + { + // These nodes go into an addr mode: + // - GT_CLS_VAR_ADDR turns into a constant. + // - GT_LCL_VAR_ADDR is a stack addr mode. + + // make this contained, it turns into a constant that goes into an addr mode + MakeSrcContained(indirTree, addr); + } + else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp)) + { + // Amd64: + // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address. + // (i.e. those VSD calls for which stub addr is known during JIT compilation time). 
+ // In this case, VM requires us to pass stub addr in REG_VIRTUAL_STUB_PARAM - see LowerVirtualStubCall(). For
+ // that reason we cannot mark such an addr as contained. Note that this is not an issue for
+ // indirect VSD calls since morphArgs() is explicitly materializing hidden param as a non-standard
+ // argument.
+ //
+ // Workaround:
+ // Note that LowerVirtualStubCall() sets addr->gtRegNum to REG_VIRTUAL_STUB_PARAM and Lowering::doPhase()
+ // sets destination candidates on such nodes and resets addr->gtRegNum to REG_NA before calling
+ // TreeNodeInfoInit(). Ideally we should set a flag on addr nodes that shouldn't be marked as contained
+ // (in LowerVirtualStubCall()), but we don't have any GTF_* flags left for that purpose. As a workaround
+ // an explicit check is made here.
+ //
+ // On x86, direct VSD is done via a relative branch, and in fact it MUST be contained.
+ MakeSrcContained(indirTree, addr);
+ }
+ else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr))
+ {
+ MakeSrcContained(indirTree, addr);
+ }
+ else if (addr->gtOper == GT_ARR_ELEM)
+ {
+ // The GT_ARR_ELEM consumes all the indices and produces the offset.
+ // The array object lives until the mem access.
+ // We also consume the target register to which the address is
+ // computed.
+
+ info->srcCount++;
+ assert(addr->gtLsraInfo.srcCount >= 2);
+ addr->gtLsraInfo.srcCount -= 1;
+ }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCmp: Set the register requirements for a compare.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree)
+{
+ assert(tree->OperIsCompare());
+
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+ info->srcCount = 2;
+ info->dstCount = 1;
+
+#ifdef _TARGET_X86_
+ // If the compare is used by a jump, we just need to set the condition codes. If not, then we need
+ // to store the result into the low byte of a register, which requires the dst be a byteable register.
+ // We always set the dst candidates, though, because if this compare is consumed by a jump, they
+ // won't be used. We might be able to use GTF_RELOP_JMP_USED to determine this case, but it's not clear
+ // that flag is maintained until this location (especially for decomposed long compares).
+ info->setDstCandidates(m_lsra, RBM_BYTE_REGS);
+#endif // _TARGET_X86_
+
+ GenTreePtr op1 = tree->gtOp.gtOp1;
+ GenTreePtr op2 = tree->gtOp.gtOp2;
+ var_types op1Type = op1->TypeGet();
+ var_types op2Type = op2->TypeGet();
+
+#if !defined(_TARGET_64BIT_)
+ // Long compares will consume GT_LONG nodes, each of which produces two results.
+ // Thus for each long operand there will be an additional source.
+ // TODO-X86-CQ: Mark hiOp2 and loOp2 as contained if it is a constant or a memory op.
+ if (varTypeIsLong(op1Type))
+ {
+ info->srcCount++;
+ }
+ if (varTypeIsLong(op2Type))
+ {
+ info->srcCount++;
+ }
+#endif // !defined(_TARGET_64BIT_)
+
+ // If either op1 or op2 is a floating point value, then we need to use
+ // ucomiss or ucomisd to compare, both of which support the following form:
+ // ucomis[s|d] xmm, xmm/mem
+ // That is, only the second operand can be a memory op.
+ //
+ // Second operand is a memory Op: Note that depending on the comparison operator,
+ // the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or
+ // op2 can be a memory op depending on the comparison operator.
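+ // For illustration (hypothetical sequence): with the memory operand contained, the compare is
+ // emitted as, e.g.
+ //     ucomiss xmm1, dword ptr [mem] ; only the second operand may be a memory op
+ // so when the operator requires the operands to be reversed, it is op1 rather than op2 that
+ // becomes the containable/reg-optional candidate.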
+ if (varTypeIsFloating(op1Type)) + { + // The type of the operands has to be the same and no implicit conversions at this stage. + assert(op1Type == op2Type); + + bool reverseOps; + if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0) + { + // Unordered comparison case + reverseOps = tree->OperIs(GT_GT, GT_GE); + } + else + { + reverseOps = tree->OperIs(GT_LT, GT_LE); + } + + GenTreePtr otherOp; + if (reverseOps) + { + otherOp = op1; + } + else + { + otherOp = op2; + } + + assert(otherOp != nullptr); + if (otherOp->IsCnsNonZeroFltOrDbl()) + { + MakeSrcContained(tree, otherOp); + } + else if (otherOp->isMemoryOp() && ((otherOp == op2) || IsSafeToContainMem(tree, otherOp))) + { + MakeSrcContained(tree, otherOp); + } + else + { + // SSE2 allows only otherOp to be a memory-op. Since otherOp is not + // contained, we can mark it reg-optional. + SetRegOptional(otherOp); + } + + return; + } + + // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here + // or in other backend. + + if (CheckImmedAndMakeContained(tree, op2)) + { + // If the types are the same, or if the constant is of the correct size, + // we can treat the isMemoryOp as contained. + if (op1Type == op2Type) + { + if (op1->isMemoryOp()) + { + MakeSrcContained(tree, op1); + } + // If op1 codegen sets ZF and SF flags and ==/!= against + // zero, we don't need to generate test instruction, + // provided we don't have another GenTree node between op1 + // and tree that could potentially modify flags. + // + // TODO-CQ: right now the below peep is inexpensive and + // gets the benefit in most of cases because in majority + // of cases op1, op2 and tree would be in that order in + // execution. In general we should be able to check that all + // the nodes that come after op1 in execution order do not + // modify the flags so that it is safe to avoid generating a + // test instruction. Such a check requires that on each + // GenTree node we need to set the info whether its codegen + // will modify flags. + // + // TODO-CQ: We can optimize compare against zero in the + // following cases by generating the branch as indicated + // against each case. + // 1) unsigned compare + // < 0 - always FALSE + // <= 0 - ZF=1 and jne + // > 0 - ZF=0 and je + // >= 0 - always TRUE + // + // 2) signed compare + // < 0 - SF=1 and js + // >= 0 - SF=0 and jns + else if (tree->OperIs(GT_EQ, GT_NE) && op1->gtSetZSFlags() && op2->IsIntegralConst(0) && + (op1->gtNext == op2) && (op2->gtNext == tree)) + { + // Require codegen of op1 to set the flags. + assert(!op1->gtSetFlags()); + op1->gtFlags |= GTF_SET_FLAGS; + } + else + { + SetRegOptional(op1); + } + } + } + else if (op1Type == op2Type) + { + // Note that TEST does not have a r,rm encoding like CMP has but we can still + // contain the second operand because the emitter maps both r,rm and rm,r to + // the same instruction code. This avoids the need to special case TEST here. + if (op2->isMemoryOp()) + { + MakeSrcContained(tree, op2); + } + else if (op1->isMemoryOp() && IsSafeToContainMem(tree, op1)) + { + MakeSrcContained(tree, op1); + } + else if (op1->IsCnsIntOrI()) + { + // TODO-CQ: We should be able to support swapping op1 and op2 to generate cmp reg, imm, + // but there is currently an assert in CodeGen::genCompareInt(). + // https://github.com/dotnet/coreclr/issues/7270 + SetRegOptional(op2); + } + else + { + // One of op1 or op2 could be marked as reg optional + // to indicate that codegen can still generate code + // if one of them is on stack. 
+ SetRegOptional(PreferredRegOptionalOperand(tree)); + } + } +} + +//-------------------------------------------------------------------------------------------- +// TreeNodeInfoInitIfRMWMemOp: Checks to see if there is a RMW memory operation rooted at +// GT_STOREIND node and if so will mark register requirements for nodes under storeInd so +// that CodeGen will generate a single instruction of the form: +// +// binOp [addressing mode], reg +// +// Parameters +// storeInd - GT_STOREIND node +// +// Return value +// True, if RMW memory op tree pattern is recognized and op counts are set. +// False otherwise. +// +bool Lowering::TreeNodeInfoInitIfRMWMemOp(GenTreePtr storeInd) +{ + assert(storeInd->OperGet() == GT_STOREIND); + + // SSE2 doesn't support RMW on float values + assert(!varTypeIsFloating(storeInd)); + + // Terminology: + // indirDst = memory write of an addr mode (i.e. storeind destination) + // indirSrc = value being written to memory (i.e. storeind source which could a binary/unary op) + // indirCandidate = memory read i.e. a gtInd of an addr mode + // indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node) + + GenTreePtr indirCandidate = nullptr; + GenTreePtr indirOpSource = nullptr; + + if (!IsRMWMemOpRootedAtStoreInd(storeInd, &indirCandidate, &indirOpSource)) + { + JITDUMP("Lower of StoreInd didn't mark the node as self contained for reason: %d\n", + storeInd->AsStoreInd()->GetRMWStatus()); + DISPTREERANGE(BlockRange(), storeInd); + return false; + } + + GenTreePtr indirDst = storeInd->gtGetOp1(); + GenTreePtr indirSrc = storeInd->gtGetOp2(); + genTreeOps oper = indirSrc->OperGet(); + + // At this point we have successfully detected a RMW memory op of one of the following forms + // storeInd(indirDst, indirSrc(indirCandidate, indirOpSource)) OR + // storeInd(indirDst, indirSrc(indirOpSource, indirCandidate) in case of commutative operations OR + // storeInd(indirDst, indirSrc(indirCandidate) in case of unary operations + // + // Here indirSrc = one of the supported binary or unary operation for RMW of memory + // indirCandidate = a GT_IND node + // indirCandidateChild = operand of GT_IND indirCandidate + // + // The logic below essentially does the following + // Make indirOpSource contained. + // Make indirSrc contained. + // Make indirCandidate contained. + // Make indirCandidateChild contained. + // Make indirDst contained except when it is a GT_LCL_VAR or GT_CNS_INT that doesn't fit within addr + // base. + // Note that due to the way containment is supported, we accomplish some of the above by clearing operand counts + // and directly propagating them upward. + // + + TreeNodeInfo* info = &(storeInd->gtLsraInfo); + info->dstCount = 0; + + if (GenTree::OperIsBinary(oper)) + { + // On Xarch RMW operations require that the source memory-op be in a register. 
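+ // For illustration (hypothetical instruction): recognizing the RMW pattern lets codegen emit a
+ // single instruction such as
+ //     add dword ptr [rcx+8], eax ; binOp [addressing mode], reg
+ // instead of a separate load, operate and store sequence.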
+ assert(!indirOpSource->isMemoryOp() || indirOpSource->gtLsraInfo.dstCount == 1);
+ JITDUMP("Lower successfully detected an assignment of the form: *addrMode BinOp= source\n");
+ info->srcCount = indirOpSource->gtLsraInfo.dstCount;
+ }
+ else
+ {
+ assert(GenTree::OperIsUnary(oper));
+ JITDUMP("Lower successfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n");
+ info->srcCount = 0;
+ }
+ DISPTREERANGE(BlockRange(), storeInd);
+
+ m_lsra->clearOperandCounts(indirSrc);
+ m_lsra->clearOperandCounts(indirCandidate);
+
+ GenTreePtr indirCandidateChild = indirCandidate->gtGetOp1();
+ if (indirCandidateChild->OperGet() == GT_LEA)
+ {
+ GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode();
+
+ if (addrMode->HasBase())
+ {
+ assert(addrMode->Base()->OperIsLeaf());
+ m_lsra->clearOperandCounts(addrMode->Base());
+ info->srcCount++;
+ }
+
+ if (addrMode->HasIndex())
+ {
+ assert(addrMode->Index()->OperIsLeaf());
+ m_lsra->clearOperandCounts(addrMode->Index());
+ info->srcCount++;
+ }
+
+ m_lsra->clearOperandCounts(indirDst);
+ }
+ else
+ {
+ assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR ||
+ indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR || indirCandidateChild->OperGet() == GT_CNS_INT);
+
+ // If it is a GT_LCL_VAR, it still needs the reg to hold the address.
+ // We would still need a reg for GT_CNS_INT if it doesn't fit within the addressing mode base.
+ // For GT_CLS_VAR_ADDR, we don't need a reg to hold the address, because the field address value is
+ // known at jit time.
+ if (indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR)
+ {
+ m_lsra->clearOperandCounts(indirDst);
+ }
+ else if (indirCandidateChild->IsCnsIntOrI() && indirCandidateChild->AsIntConCommon()->FitsInAddrBase(comp))
+ {
+ m_lsra->clearOperandCounts(indirDst);
+ }
+ else
+ {
+ // Need a reg and hence increment src count of storeind
+ info->srcCount += indirCandidateChild->gtLsraInfo.dstCount;
+ }
+ }
+ m_lsra->clearOperandCounts(indirCandidateChild);
+
+#ifdef _TARGET_X86_
+ if (varTypeIsByte(storeInd))
+ {
+ // If storeInd is of TYP_BYTE, restrict indirOpSource to byteable registers.
+ bool containedNode = indirOpSource->gtLsraInfo.dstCount == 0;
+ if (!containedNode)
+ {
+ regMaskTP regMask = indirOpSource->gtLsraInfo.getSrcCandidates(m_lsra);
+ assert(regMask != RBM_NONE);
+ indirOpSource->gtLsraInfo.setSrcCandidates(m_lsra, regMask & ~RBM_NON_BYTE_REGS);
+ }
+ }
+#endif
+
+ return true;
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitMul: Set the NodeInfo for a multiply.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitMul(GenTreePtr tree)
+{
+#if defined(_TARGET_X86_)
+ assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI || tree->OperGet() == GT_MUL_LONG);
+#else
+ assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI);
+#endif
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+ info->srcCount = 2;
+ info->dstCount = 1;
+
+ GenTreePtr op1 = tree->gtOp.gtOp1;
+ GenTreePtr op2 = tree->gtOp.gtOp2;
+
+ // Case of float/double mul.
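+ // For illustration (hypothetical instruction): the SSE scalar multiplies accept a memory
+ // second operand, e.g.
+ //     mulsd xmm0, qword ptr [mem]
+ // which is why op2 (or op1, when it is safe to swap the operands) can be contained below.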
+ if (varTypeIsFloating(tree->TypeGet())) + { + assert(tree->OperGet() == GT_MUL); + + if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl()) + { + MakeSrcContained(tree, op2); + } + else if (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))) + { + // Since GT_MUL is commutative, we will try to re-order operands if it is safe to + // generate more efficient code sequence for the case of GT_MUL(op1=memOp, op2=non-memOp) + MakeSrcContained(tree, op1); + } + else + { + // If there are no containable operands, we can make an operand reg optional. + SetRegOptionalForBinOp(tree); + } + return; + } + + bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0); + bool requiresOverflowCheck = tree->gtOverflowEx(); + bool useLeaEncoding = false; + GenTreePtr memOp = nullptr; + + bool hasImpliedFirstOperand = false; + GenTreeIntConCommon* imm = nullptr; + GenTreePtr other = nullptr; + + // There are three forms of x86 multiply: + // one-op form: RDX:RAX = RAX * r/m + // two-op form: reg *= r/m + // three-op form: reg = r/m * imm + + // This special widening 32x32->64 MUL is not used on x64 + CLANG_FORMAT_COMMENT_ANCHOR; +#if defined(_TARGET_X86_) + if (tree->OperGet() != GT_MUL_LONG) +#endif + { + assert((tree->gtFlags & GTF_MUL_64RSLT) == 0); + } + + // Multiply should never be using small types + assert(!varTypeIsSmall(tree->TypeGet())); + + // We do use the widening multiply to implement + // the overflow checking for unsigned multiply + // + if (isUnsignedMultiply && requiresOverflowCheck) + { + // The only encoding provided is RDX:RAX = RAX * rm + // + // Here we set RAX as the only destination candidate + // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX + // + info->setDstCandidates(m_lsra, RBM_RAX); + hasImpliedFirstOperand = true; + } + else if (tree->OperGet() == GT_MULHI) + { + // Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the + // upper 32 bits of the result set the destination candidate to REG_RDX. + info->setDstCandidates(m_lsra, RBM_RDX); + hasImpliedFirstOperand = true; + } +#if defined(_TARGET_X86_) + else if (tree->OperGet() == GT_MUL_LONG) + { + // have to use the encoding:RDX:RAX = RAX * rm + info->setDstCandidates(m_lsra, RBM_RAX); + hasImpliedFirstOperand = true; + } +#endif + else if (IsContainableImmed(tree, op2) || IsContainableImmed(tree, op1)) + { + if (IsContainableImmed(tree, op2)) + { + imm = op2->AsIntConCommon(); + other = op1; + } + else + { + imm = op1->AsIntConCommon(); + other = op2; + } + + // CQ: We want to rewrite this into a LEA + ssize_t immVal = imm->AsIntConCommon()->IconValue(); + if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9)) + { + useLeaEncoding = true; + } + + MakeSrcContained(tree, imm); // The imm is always contained + if (other->isMemoryOp()) + { + memOp = other; // memOp may be contained below + } + } + + // We allow one operand to be a contained memory operand. + // The memory op type must match with the 'tree' type. + // This is because during codegen we use 'tree' type to derive EmitTypeSize. + // E.g op1 type = byte, op2 type = byte but GT_MUL tree type is int. 
+ // + if (memOp == nullptr && op2->isMemoryOp()) + { + memOp = op2; + } + + // To generate an LEA we need to force memOp into a register + // so don't allow memOp to be 'contained' + // + if (!useLeaEncoding) + { + if ((memOp != nullptr) && (memOp->TypeGet() == tree->TypeGet()) && IsSafeToContainMem(tree, memOp)) + { + MakeSrcContained(tree, memOp); + } + else if (imm != nullptr) + { + // Has a contained immediate operand. + // Only 'other' operand can be marked as reg optional. + assert(other != nullptr); + SetRegOptional(other); + } + else if (hasImpliedFirstOperand) + { + // Only op2 can be marke as reg optional. + SetRegOptional(op2); + } + else + { + // If there are no containable operands, we can make either of op1 or op2 + // as reg optional. + SetRegOptionalForBinOp(tree); + } + } +} + +//------------------------------------------------------------------------------ +// SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, set +// Contains256bitAVX flag when SIMD vector size is 32 bytes +// +// Arguments: +// isFloatingPointType - true if it is floating point type +// sizeOfSIMDVector - SIMD Vector size +// +void Lowering::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/) +{ +#ifdef FEATURE_AVX_SUPPORT + if (isFloatingPointType) + { + if (comp->getFloatingPointInstructionSet() == InstructionSet_AVX) + { + comp->getEmitter()->SetContainsAVX(true); + } + if (sizeOfSIMDVector == 32 && comp->getSIMDInstructionSet() == InstructionSet_AVX) + { + comp->getEmitter()->SetContains256bitAVX(true); + } + } +#endif +} + +#ifdef _TARGET_X86_ +//------------------------------------------------------------------------ +// ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for +// various reasons +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// If we need to exclude non-byteable registers +// +bool Lowering::ExcludeNonByteableRegisters(GenTree* tree) +{ + // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr' + // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT + // value. In this case we need to exclude esi/edi from the src candidates of op2. + if (varTypeIsByte(tree)) + { + return true; + } + // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool. + else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType())) + { + return true; + } + else if (tree->OperIsCompare()) + { + GenTree* op1 = tree->gtGetOp1(); + GenTree* op2 = tree->gtGetOp2(); + + // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses + // ubyte as the result of comparison and if the result needs to be materialized into a reg + // simply zero extend it to TYP_INT size. Here is an example of generated code: + // cmp dl, byte ptr[addr mode] + // movzx edx, dl + if (varTypeIsByte(op1) && varTypeIsByte(op2)) + { + return true; + } + // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses + // ubyte as the result of the comparison and if the result needs to be materialized into a reg + // simply zero extend it to TYP_INT size. 
+ else if (varTypeIsByte(op1) && op2->IsCnsIntOrI())
+ {
+ return true;
+ }
+ // Example5: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses
+ // ubyte as the result of the comparison and if the result needs to be materialized into a reg
+ // simply zero extend it to TYP_INT size.
+ else if (op1->IsCnsIntOrI() && varTypeIsByte(op2))
+ {
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+#ifdef FEATURE_SIMD
+ else if (tree->OperGet() == GT_SIMD)
+ {
+ GenTreeSIMD* simdNode = tree->AsSIMD();
+ switch (simdNode->gtSIMDIntrinsicID)
+ {
+ case SIMDIntrinsicOpEquality:
+ case SIMDIntrinsicOpInEquality:
+ // We manifest it into a byte register, so the target must be byteable.
+ return true;
+
+ case SIMDIntrinsicGetItem:
+ {
+ // This logic is duplicated from genSIMDIntrinsicGetItem().
+ // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
+ // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
+ // cases will require this, so the non-byteable registers can be excluded.
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ if (!op1->isMemoryOp() && op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
+ {
+ bool ZeroOrSignExtnReqd = true;
+ unsigned baseSize = genTypeSize(baseType);
+ if (baseSize == 1)
+ {
+ if ((op2->gtIntCon.gtIconVal % 2) == 1)
+ {
+ ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
+ }
+ }
+ else
+ {
+ assert(baseSize == 2);
+ ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
+ }
+ return ZeroOrSignExtnReqd;
+ }
+ break;
+ }
+
+ default:
+ break;
+ }
+ return false;
+ }
+#endif // FEATURE_SIMD
+ else
+ {
+ return false;
+ }
+}
+#endif // _TARGET_X86_
+
+#endif // _TARGET_XARCH_
+
+#endif // !LEGACY_BACKEND