// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                           Lowering for AMD64                              XX
XX                                                                           XX
XX  This encapsulates all the logic for lowering trees for the AMD64         XX
XX  architecture.  For a more detailed view of what lowering is, please      XX
XX  take a look at Lower.cpp                                                 XX
XX                                                                           XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/

#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator

#ifdef _TARGET_XARCH_

#include "jit.h"
#include "lower.h"

// xarch supports both ROL and ROR instructions so no lowering is required.
void Lowering::LowerRotate(GenTreePtr tree)
{
}

// There is not much lowering to do when storing a local, but we do handle
// contained immediates and widen small constant stores.
void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
{
    TreeNodeInfo* info = &(storeLoc->gtLsraInfo);

    // Is this the case of var = call where call is returning
    // a value in multiple return registers?
    GenTree* op1 = storeLoc->gtGetOp1();
    if (op1->IsMultiRegCall())
    {
        // backend expects to see this case only for store lclvar.
        assert(storeLoc->OperGet() == GT_STORE_LCL_VAR);

        // srcCount = number of registers in which the value is returned by call
        GenTreeCall*    call        = op1->AsCall();
        ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
        info->srcCount              = retTypeDesc->GetReturnRegCount();

        // Call node srcCandidates = Bitwise-OR(allregs(GetReturnRegType(i))) for all i=0..RetRegCount-1
        regMaskTP srcCandidates = m_lsra->allMultiRegCallNodeRegs(call);
        op1->gtLsraInfo.setSrcCandidates(m_lsra, srcCandidates);
        return;
    }

#ifdef FEATURE_SIMD
    if (storeLoc->TypeGet() == TYP_SIMD12)
    {
        // Need an additional register to extract upper 4 bytes of Vector3.
        info->internalFloatCount = 1;
        info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());

        // In this case don't mark the operand as contained as we want it to
        // be evaluated into an xmm register
        return;
    }
#endif // FEATURE_SIMD

    // If the source is a containable immediate, make it contained, unless it is
    // an int-size or larger store of zero to memory, because we can generate smaller code
    // by zeroing a register and then storing it.
    if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(storeLoc)))
    {
        MakeSrcContained(storeLoc, op1);
    }

    // Try to widen the ops if they are going into a local var.
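    // Illustrative note (added, not from the original comments): for a TYP_BYTE local, storing the
    // constant 0xFF takes the sign-extension path below -- (0xFF & 0x7f) != 0xFF, so ival is or-ed
    // with 0xffffff00 to become 0xFFFFFFFF -- and the store is then widened to TYP_INT, keeping the
    // stack slot contents equivalent while allowing a full 4-byte store.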
    if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (storeLoc->gtOp1->gtOper == GT_CNS_INT))
    {
        GenTreeIntCon* con    = storeLoc->gtOp1->AsIntCon();
        ssize_t        ival   = con->gtIconVal;
        unsigned       varNum = storeLoc->gtLclNum;
        LclVarDsc*     varDsc = comp->lvaTable + varNum;

        if (varDsc->lvIsSIMDType())
        {
            noway_assert(storeLoc->gtType != TYP_STRUCT);
        }
        unsigned size = genTypeSize(storeLoc);
        // If we are storing a constant into a local variable
        // we extend the size of the store here
        if ((size < 4) && !varTypeIsStruct(varDsc))
        {
            if (!varTypeIsUnsigned(varDsc))
            {
                if (genTypeSize(storeLoc) == 1)
                {
                    if ((ival & 0x7f) != ival)
                    {
                        ival = ival | 0xffffff00;
                    }
                }
                else
                {
                    assert(genTypeSize(storeLoc) == 2);
                    if ((ival & 0x7fff) != ival)
                    {
                        ival = ival | 0xffff0000;
                    }
                }
            }

            // A local stack slot is at least 4 bytes in size, regardless of
            // what the local var is typed as, so auto-promote it here
            // unless it is a field of a promoted struct
            // TODO-XArch-CQ: if the field is promoted shouldn't we also be able to do this?
            if (!varDsc->lvIsStructField)
            {
                storeLoc->gtType = TYP_INT;
                con->SetIconValue(ival);
            }
        }
    }
}

/**
 * Takes care of annotating the register requirements
 * for every TreeNodeInfo struct that maps to each tree node.
 * Preconditions:
 *    LSRA has been initialized and there is a TreeNodeInfo node
 *    already allocated and initialized for every tree in the IR.
 * Postconditions:
 *    Every TreeNodeInfo instance has the right annotations on register
 *    requirements needed by LSRA to build the Interval Table (source,
 *    destination and internal [temp] register counts).
 *    This code was originally refactored out of LSRA.
 */
void Lowering::TreeNodeInfoInit(GenTree* stmt)
{
    LinearScan* l        = m_lsra;
    Compiler*   compiler = comp;

    assert(stmt->gtStmt.gtStmtIsTopLevel());
    GenTree* tree = stmt->gtStmt.gtStmtList;

    while (tree)
    {
        TreeNodeInfo* info = &(tree->gtLsraInfo);
        GenTree*      next = tree->gtNext;

        switch (tree->OperGet())
        {
            GenTree* op1;
            GenTree* op2;

        default:
            TreeNodeInfoInitSimple(tree);
            break;

        case GT_LCL_FLD:
            info->srcCount = 0;
            info->dstCount = 1;

#ifdef FEATURE_SIMD
            // Need an additional register to read upper 4 bytes of Vector3.
            if (tree->TypeGet() == TYP_SIMD12)
            {
                // We need an internal register different from targetReg in which 'tree' produces its result
                // because both targetReg and internal reg will be in use at the same time. This is achieved
                // by asking for two internal registers.
                info->internalFloatCount = 2;
                info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
            }
#endif
            break;

        case GT_STORE_LCL_FLD:
        case GT_STORE_LCL_VAR:
            info->srcCount = 1;
            info->dstCount = 0;
            LowerStoreLoc(tree->AsLclVarCommon());
            break;

        case GT_BOX:
            noway_assert(!"box should not exist here");
            // The result of 'op1' is also the final result
            info->srcCount = 0;
            info->dstCount = 0;
            break;

        case GT_PHYSREGDST:
            info->srcCount = 1;
            info->dstCount = 0;
            break;

        case GT_COMMA:
            {
                GenTreePtr firstOperand;
                GenTreePtr secondOperand;
                if (tree->gtFlags & GTF_REVERSE_OPS)
                {
                    firstOperand  = tree->gtOp.gtOp2;
                    secondOperand = tree->gtOp.gtOp1;
                }
                else
                {
                    firstOperand  = tree->gtOp.gtOp1;
                    secondOperand = tree->gtOp.gtOp2;
                }
                if (firstOperand->TypeGet() != TYP_VOID)
                {
                    firstOperand->gtLsraInfo.isLocalDefUse = true;
                    firstOperand->gtLsraInfo.dstCount      = 0;
                }
                if (tree->TypeGet() == TYP_VOID && secondOperand->TypeGet() != TYP_VOID)
                {
                    secondOperand->gtLsraInfo.isLocalDefUse = true;
                    secondOperand->gtLsraInfo.dstCount      = 0;
                }
            }
            info->srcCount = 0;
            info->dstCount = 0;
            break;

        case GT_LIST:
        case GT_ARGPLACE:
        case GT_NO_OP:
        case GT_START_NONGC:
        case GT_PROF_HOOK:
            info->srcCount = 0;
            info->dstCount = 0;
            break;

        case GT_CNS_DBL:
            info->srcCount = 0;
            info->dstCount = 1;
            break;

#if !defined(_TARGET_64BIT_)
        case GT_LONG:
            if (tree->gtNext == nullptr)
            {
                // An uncontained GT_LONG node needs to consume its source operands
                info->srcCount = 2;
            }
            else
            {
                // Passthrough
                info->srcCount = 0;
            }
            info->dstCount = 0;
            break;
#endif // !defined(_TARGET_64BIT_)

        case GT_QMARK:
        case GT_COLON:
            info->srcCount = 0;
            info->dstCount = 0;
            unreached();
            break;

        case GT_RETURN:
            TreeNodeInfoInitReturn(tree);
            break;

        case GT_RETFILT:
            if (tree->TypeGet() == TYP_VOID)
            {
                info->srcCount = 0;
                info->dstCount = 0;
            }
            else
            {
                assert(tree->TypeGet() == TYP_INT);

                info->srcCount = 1;
                info->dstCount = 1;

                info->setSrcCandidates(l, RBM_INTRET);
                tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET);
            }
            break;

        // A GT_NOP is a passthrough if it is void or has a child, but it must be
        // considered to produce a dummy value if it has a type and no child.
        case GT_NOP:
            info->srcCount = 0;
            if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
            {
                info->dstCount = 1;
            }
            else
            {
                info->dstCount = 0;
            }
            break;

        case GT_JTRUE:
            info->srcCount = 0;
            info->dstCount = 0;
            l->clearDstCount(tree->gtOp.gtOp1);
            break;

        case GT_JMP:
            info->srcCount = 0;
            info->dstCount = 0;
            break;

        case GT_SWITCH:
            // This should never occur since switch nodes must not be visible at this
            // point in the JIT.
            info->srcCount = 0;
            info->dstCount = 0; // To avoid getting uninit errors.
            noway_assert(!"Switch must be lowered at this point");
            break;

        case GT_JMPTABLE:
            info->srcCount = 0;
            info->dstCount = 1;
            break;

        case GT_SWITCH_TABLE:
            info->srcCount         = 2;
            info->internalIntCount = 1;
            info->dstCount         = 0;
            break;

        case GT_ASG:
        case GT_ASG_ADD:
        case GT_ASG_SUB:
            noway_assert(!"We should never hit any assignment operator in lowering");
            info->srcCount = 0;
            info->dstCount = 0;
            break;

#if !defined(_TARGET_64BIT_)
        case GT_ADD_LO:
        case GT_ADD_HI:
        case GT_SUB_LO:
        case GT_SUB_HI:
#endif
        case GT_ADD:
        case GT_SUB:
            // SSE2 arithmetic instructions don't support the form "op mem, xmm";
            // they only support the "op xmm, mem/xmm" form.
            if (varTypeIsFloating(tree->TypeGet()))
            {
                // overflow operations aren't supported on float/double types.
                assert(!tree->gtOverflow());

                op1 = tree->gtGetOp1();
                op2 = tree->gtGetOp2();

                // No implicit conversions at this stage as the expectation is that
                // everything is made explicit by adding casts.
                assert(op1->TypeGet() == op2->TypeGet());

                info->srcCount = 2;
                info->dstCount = 1;

                if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
                {
                    MakeSrcContained(tree, op2);
                }
                else if (tree->OperIsCommutative() &&
                         (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))))
                {
                    // Though we have GT_ADD(op1=memOp, op2=non-memOp), we try to reorder the operands
                    // as long as it is safe so that the following efficient code sequence is generated:
                    //      addss/sd targetReg, memOp    (if op1Reg == targetReg) OR
                    //      movaps targetReg, op2Reg; addss/sd targetReg, [memOp]
                    //
                    // Instead of
                    //      movss op1Reg, [memOp]; addss/sd targetReg, Op2Reg  (if op1Reg == targetReg) OR
                    //      movss op1Reg, [memOp]; movaps targetReg, op1Reg; addss/sd targetReg, Op2Reg
                    MakeSrcContained(tree, op1);
                }
                else
                {
                    // If there are no containable operands, we can make an operand reg optional.
                    SetRegOptionalForBinOp(tree);
                }
                break;
            }

            __fallthrough;

        case GT_AND:
        case GT_OR:
        case GT_XOR:
            TreeNodeInfoInitLogicalOp(tree);
            break;

        case GT_RETURNTRAP:
            // This just turns into a compare of its child with an int
            // + a conditional call.
            info->srcCount = 1;
            info->dstCount = 0;
            if (tree->gtOp.gtOp1->isIndir())
            {
                MakeSrcContained(tree, tree->gtOp.gtOp1);
            }
            info->internalIntCount = 1;
            info->setInternalCandidates(l, l->allRegs(TYP_INT));
            break;

        case GT_MOD:
        case GT_DIV:
        case GT_UMOD:
        case GT_UDIV:
            TreeNodeInfoInitModDiv(tree);
            break;

        case GT_MUL:
        case GT_MULHI:
            SetMulOpCounts(tree);
            break;

        case GT_INTRINSIC:
            TreeNodeInfoInitIntrinsic(tree);
            break;

#ifdef FEATURE_SIMD
        case GT_SIMD:
            TreeNodeInfoInitSIMD(tree);
            break;
#endif // FEATURE_SIMD

        case GT_CAST:
            TreeNodeInfoInitCast(tree);
            break;

        case GT_NEG:
            info->srcCount = 1;
            info->dstCount = 1;

            // TODO-XArch-CQ:
            // SSE instruction set doesn't have an instruction to negate a number.
            // The recommended way is to xor the float/double number with a bitmask.
            // The only way to xor is using xorps or xorpd, both of which operate on
            // 128-bit operands.  To hold the bit-mask we would need another xmm
            // register or a 16-byte aligned 128-bit data constant.  Right now emitter
            // lacks the support for emitting such constants or instruction with mem
            // addressing mode referring to a 128-bit operand.  For now we use an
            // internal xmm register to load 32/64-bit bitmask from data section.
            // Note that by trading additional data section memory (128-bit) we can
            // save on the need for an internal register and also a memory-to-reg
            // move.
            //
            // Note: another option to avoid internal register requirement is by
            // lowering as GT_SUB(0, src).  This will generate code different from
            // Jit64 and could possibly result in compat issues (?).
            if (varTypeIsFloating(tree))
            {
                info->internalFloatCount = 1;
                info->setInternalCandidates(l, l->internalFloatRegCandidates());
            }
            break;

        case GT_NOT:
            info->srcCount = 1;
            info->dstCount = 1;
            break;

        case GT_LSH:
        case GT_RSH:
        case GT_RSZ:
        case GT_ROL:
        case GT_ROR:
            TreeNodeInfoInitShiftRotate(tree);
            break;

        case GT_EQ:
        case GT_NE:
        case GT_LT:
        case GT_LE:
        case GT_GE:
        case GT_GT:
            LowerCmp(tree);
            break;

        case GT_CKFINITE:
            info->srcCount         = 1;
            info->dstCount         = 1;
            info->internalIntCount = 1;
            break;

        case GT_CMPXCHG:
            info->srcCount = 3;
            info->dstCount = 1;

            // comparand is preferenced to RAX.
            // Remaining two operands can be in any reg other than RAX.
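            // Background note (added, not in the original comments): the x86/x64 cmpxchg instruction
            // implicitly compares against and writes its result back to EAX/RAX, which is why the
            // comparand and the node's destination are constrained to RAX below.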
            tree->gtCmpXchg.gtOpComparand->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
            tree->gtCmpXchg.gtOpLocation->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX);
            tree->gtCmpXchg.gtOpValue->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX);
            tree->gtLsraInfo.setDstCandidates(l, RBM_RAX);
            break;

        case GT_LOCKADD:
            info->srcCount = 2;
            info->dstCount = 0;

            CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
            break;

        case GT_CALL:
            TreeNodeInfoInitCall(tree->AsCall());
            break;

        case GT_ADDR:
            {
                // For a GT_ADDR, the child node should not be evaluated into a register
                GenTreePtr child = tree->gtOp.gtOp1;
                assert(!l->isCandidateLocalRef(child));
                l->clearDstCount(child);
                info->srcCount = 0;
                info->dstCount = 1;
            }
            break;

#ifdef _TARGET_X86_
        case GT_OBJ:
            NYI_X86("GT_OBJ");
#endif //_TARGET_X86_

        case GT_INITBLK:
        case GT_COPYBLK:
        case GT_COPYOBJ:
            TreeNodeInfoInitBlockStore(tree->AsBlkOp());
            break;

#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
        case GT_PUTARG_STK:
            TreeNodeInfoInitPutArgStk(tree);
            break;
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING

        case GT_LCLHEAP:
            TreeNodeInfoInitLclHeap(tree);
            break;

        case GT_ARR_BOUNDS_CHECK:
#ifdef FEATURE_SIMD
        case GT_SIMD_CHK:
#endif // FEATURE_SIMD
            {
                GenTreeBoundsChk* node = tree->AsBoundsChk();

                // Consumes arrLen & index - has no result
                info->srcCount = 2;
                info->dstCount = 0;

                GenTreePtr other = nullptr;
                if (CheckImmedAndMakeContained(tree, node->gtIndex))
                {
                    other = node->gtArrLen;
                }
                else if (CheckImmedAndMakeContained(tree, node->gtArrLen))
                {
                    other = node->gtIndex;
                }
                else if (node->gtIndex->isMemoryOp())
                {
                    other = node->gtIndex;
                }
                else
                {
                    other = node->gtArrLen;
                }

                if (other->isMemoryOp())
                {
                    if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet())
                    {
                        MakeSrcContained(tree, other);
                    }
                }
                else
                {
                    // since 'other' operand is not contained, we can mark it as reg optional
                    TryToSetRegOptional(other);
                }
            }
            break;

        case GT_ARR_ELEM:
            // These must have been lowered to GT_ARR_INDEX
            noway_assert(!"We should never see a GT_ARR_ELEM in lowering");
            info->srcCount = 0;
            info->dstCount = 0;
            break;

        case GT_ARR_INDEX:
            info->srcCount = 2;
            info->dstCount = 1;
            // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
            // times while the result is being computed.
            tree->AsArrIndex()->ArrObj()->gtLsraInfo.isDelayFree = true;
            info->hasDelayFreeSrc = true;
            break;

        case GT_ARR_OFFSET:
            // This consumes the offset, if any, the arrObj and the effective index,
            // and produces the flattened offset for this dimension.
            info->srcCount         = 3;
            info->dstCount         = 1;
            info->internalIntCount = 1;
            // we don't want to generate code for this
            if (tree->gtArrOffs.gtOffset->IsIntegralConst(0))
            {
                MakeSrcContained(tree, tree->gtArrOffs.gtOffset);
            }
            break;

        case GT_LEA:
            // The LEA usually passes its operands through to the GT_IND, in which case we'll
            // clear the info->srcCount and info->dstCount later, but we may be instantiating an address,
            // so we set them here.
            info->srcCount = 0;
            if (tree->AsAddrMode()->HasBase())
            {
                info->srcCount++;
            }
            if (tree->AsAddrMode()->HasIndex())
            {
                info->srcCount++;
            }
            info->dstCount = 1;
            break;

        case GT_STOREIND:
            {
                info->srcCount = 2;
                info->dstCount = 0;
                GenTree* src = tree->gtOp.gtOp2;

                if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
                {
                    LowerGCWriteBarrier(tree);
                    break;
                }

                // If the source is a containable immediate, make it contained, unless it is
                // an int-size or larger store of zero to memory, because we can generate smaller code
                // by zeroing a register and then storing it.
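                // Illustrative size note (added): "mov dword ptr [addr], 0" carries a 4-byte immediate,
                // while "xor reg, reg" followed by "mov [addr], reg" avoids the immediate entirely and
                // lets the zeroed register be reused for nearby stores, which is why an int-size (or
                // larger) store of zero is left uncontained here.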
                if (IsContainableImmed(tree, src) &&
                    (!src->IsIntegralConst(0) || varTypeIsSmall(tree) ||
                     tree->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR))
                {
                    MakeSrcContained(tree, src);
                }
                else if (!varTypeIsFloating(tree))
                {
                    // Perform recognition of trees with the following structure:
                    //        StoreInd(addr, BinOp(expr, GT_IND(addr)))
                    // to be able to fold this into an instruction of the form
                    //        BINOP [addr], register
                    // where register is the actual place where 'expr' is computed.
                    //
                    // SSE2 doesn't support RMW form of instructions.
                    if (SetStoreIndOpCountsIfRMWMemOp(tree))
                    {
                        break;
                    }
                }

                SetIndirAddrOpCounts(tree);
            }
            break;

        case GT_NULLCHECK:
            info->dstCount      = 0;
            info->srcCount      = 1;
            info->isLocalDefUse = true;
            break;

        case GT_IND:
            info->dstCount = 1;
            info->srcCount = 1;
            SetIndirAddrOpCounts(tree);
            break;

        case GT_CATCH_ARG:
            info->srcCount = 0;
            info->dstCount = 1;
            info->setDstCandidates(l, RBM_EXCEPTION_OBJECT);
            break;

#if !FEATURE_EH_FUNCLETS
        case GT_END_LFIN:
            info->srcCount = 0;
            info->dstCount = 0;
            break;
#endif

        case GT_CLS_VAR:
            info->srcCount = 0;
            // GT_CLS_VAR, by the time we reach the backend, must always
            // be a pure use.
            // It will produce a result of the type of the
            // node, and use an internal register for the address.
            info->dstCount = 1;
            assert((tree->gtFlags & (GTF_VAR_DEF | GTF_VAR_USEASG | GTF_VAR_USEDEF)) == 0);
            info->internalIntCount = 1;
            break;
        } // end switch (tree->OperGet())

        // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
        // Even then we would like to set isTgtPref on Op1.
        if (tree->OperIsBinary() && info->srcCount >= 1)
        {
            if (isRMWRegOper(tree))
            {
                GenTree* op1 = tree->gtOp.gtOp1;
                GenTree* op2 = tree->gtOp.gtOp2;

                // Commutative opers like add/mul/and/or/xor could reverse the order of
                // operands if it is safe to do so.  In such a case we would like op2 to be
                // target preferenced instead of op1.
                if (tree->OperIsCommutative() && op1->gtLsraInfo.dstCount == 0 && op2 != nullptr)
                {
                    op1 = op2;
                    op2 = tree->gtOp.gtOp1;
                }

                // If we have a read-modify-write operation, we want to preference op1 to the target.
                // If op1 is contained, we don't want to preference it, but it won't
                // show up as a source in that case, so it will be ignored.
                op1->gtLsraInfo.isTgtPref = true;

                // Is this a non-commutative operator, or is op2 a contained memory op?
                // (Note that we can't call IsContained() at this point because it uses exactly the
                // same information we're currently computing.)
                // In either case, we need to make op2 remain live until the op is complete, by marking
                // the source(s) associated with op2 as "delayFree".
                // Note that if op2 of a binary RMW operator is a memory op, even if the operator
                // is commutative, codegen cannot reverse them.
                // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
                // more work to be done to correctly reverse the operands if they involve memory
                // operands.  Also, we may need to handle more cases than GT_IND, especially once
                // we've modified the register allocator to not require all nodes to be assigned
                // a register (e.g. a spilled lclVar can often be referenced directly from memory).
                // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.

                GenTree* delayUseSrc = nullptr;
                // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
                // to special case them.
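                // Added note: for the interlocked-style nodes below, op1 is the memory address that is
                // both read and written by the instruction, so it is op1 (rather than op2) whose register
                // must stay live until the operation completes; hence it becomes the delay-free source.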
                if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD)
                {
                    delayUseSrc = op1;
                }
                else if ((op2 != nullptr) &&
                         (!tree->OperIsCommutative() || (op2->isMemoryOp() && (op2->gtLsraInfo.srcCount == 0))))
                {
                    delayUseSrc = op2;
                }
                if (delayUseSrc != nullptr)
                {
                    // If delayUseSrc is an indirection and it doesn't produce a result, then we need to set
                    // "delayFree" on the base & index, if any.
                    // Otherwise, we set it on delayUseSrc itself.
                    if (delayUseSrc->isIndir() && (delayUseSrc->gtLsraInfo.dstCount == 0))
                    {
                        GenTree* base  = delayUseSrc->AsIndir()->Base();
                        GenTree* index = delayUseSrc->AsIndir()->Index();
                        if (base != nullptr)
                        {
                            base->gtLsraInfo.isDelayFree = true;
                        }
                        if (index != nullptr)
                        {
                            index->gtLsraInfo.isDelayFree = true;
                        }
                    }
                    else
                    {
                        delayUseSrc->gtLsraInfo.isDelayFree = true;
                    }
                    info->hasDelayFreeSrc = true;
                }
            }
        }

#ifdef _TARGET_X86_
        // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands
        // if the tree node is a byte type.
        //
        // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'.
        //           Storeind itself will not produce any value and hence dstCount=0.  But op2 could be
        //           a TYP_INT value.  In this case we need to exclude esi/edi from the src candidates of op2.
        //
        // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool.
        //
        // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
        //           ubyte as the result of comparison and if the result needs to be materialized into a reg
        //           it simply zero extends it to TYP_INT size.  Here is an example of generated code:
        //                  cmp dl, byte ptr[addr mode]
        //                  movzx edx, dl
        //
        // Though this looks conservative in theory, in practice we have not found a case where this
        // logic leads to an over-constrained register specification.  If or when we find such a case,
        // this logic will need to be fine-tuned for it.
        if (varTypeIsByte(tree) || ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType())) ||
            (tree->OperIsCompare() && varTypeIsByte(tree->gtGetOp1()) && varTypeIsByte(tree->gtGetOp2())))
        {
            regMaskTP regMask;
            if (info->dstCount > 0)
            {
                regMask = info->getDstCandidates(l);
                assert(regMask != RBM_NONE);
                info->setDstCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
            }

            if (tree->OperIsSimple() && (info->srcCount > 0))
            {
                // No need to set src candidates on a contained child operand.
                GenTree* op = tree->gtOp.gtOp1;
                assert(op != nullptr);
                bool containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0);
                if (!containedNode)
                {
                    regMask = op->gtLsraInfo.getSrcCandidates(l);
                    assert(regMask != RBM_NONE);
                    op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
                }

                if (tree->OperIsBinary() && (tree->gtOp.gtOp2 != nullptr))
                {
                    op            = tree->gtOp.gtOp2;
                    containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0);
                    if (!containedNode)
                    {
                        regMask = op->gtLsraInfo.getSrcCandidates(l);
                        assert(regMask != RBM_NONE);
                        op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
                    }
                }
            }
        }
#endif //_TARGET_X86_

        // We need to be sure that we've set info->srcCount and info->dstCount appropriately
        assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));

        tree = next;
    }
}

//------------------------------------------------------------------------
// TreeNodeInfoInitSimple: Sets the srcCount and dstCount for all the trees
// without special handling based on the tree node type.
//
// Arguments:
//    tree      - The node of interest
//
// Return Value:
//    None.
//
void Lowering::TreeNodeInfoInitSimple(GenTree* tree)
{
    TreeNodeInfo* info = &(tree->gtLsraInfo);
    unsigned      kind = tree->OperKind();
    info->dstCount     = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
    if (kind & (GTK_CONST | GTK_LEAF))
    {
        info->srcCount = 0;
    }
    else if (kind & (GTK_SMPOP))
    {
        if (tree->gtGetOp2() != nullptr)
        {
            info->srcCount = 2;
        }
        else
        {
            info->srcCount = 1;
        }
    }
    else
    {
        unreached();
    }
}

//------------------------------------------------------------------------
// TreeNodeInfoInitReturn: Set the NodeInfo for a GT_RETURN.
//
// Arguments:
//    tree      - The node of interest
//
// Return Value:
//    None.
//
void Lowering::TreeNodeInfoInitReturn(GenTree* tree)
{
    TreeNodeInfo* info     = &(tree->gtLsraInfo);
    LinearScan*   l        = m_lsra;
    Compiler*     compiler = comp;

#if !defined(_TARGET_64BIT_)
    if (tree->TypeGet() == TYP_LONG)
    {
        GenTree* op1 = tree->gtGetOp1();
        noway_assert(op1->OperGet() == GT_LONG);
        GenTree* loVal = op1->gtGetOp1();
        GenTree* hiVal = op1->gtGetOp2();
        info->srcCount = 2;
        loVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_LO);
        hiVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_HI);
        info->dstCount = 0;
    }
    else
#endif // !defined(_TARGET_64BIT_)
    {
        GenTree*  op1           = tree->gtGetOp1();
        regMaskTP useCandidates = RBM_NONE;

        info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
        info->dstCount = 0;

#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
        if (varTypeIsStruct(tree))
        {
            // op1 has to be either an lclvar or a multi-reg returning call
            if (op1->OperGet() == GT_LCL_VAR)
            {
                GenTreeLclVarCommon* lclVarCommon = op1->AsLclVarCommon();
                LclVarDsc*           varDsc       = &(compiler->lvaTable[lclVarCommon->gtLclNum]);
                assert(varDsc->lvIsMultiRegRet);

                // Mark var as contained if not enregistrable.
                if (!varTypeIsEnregisterableStruct(op1))
                {
                    MakeSrcContained(tree, op1);
                }
            }
            else
            {
                noway_assert(op1->IsMultiRegCall());

                ReturnTypeDesc* retTypeDesc = op1->AsCall()->GetReturnTypeDesc();
                info->srcCount              = retTypeDesc->GetReturnRegCount();
                useCandidates               = retTypeDesc->GetABIReturnRegs();
            }
        }
        else
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
        {
            // Non-struct type return - determine useCandidates
            switch (tree->TypeGet())
            {
            case TYP_VOID:
                useCandidates = RBM_NONE;
                break;
            case TYP_FLOAT:
                useCandidates = RBM_FLOATRET;
                break;
            case TYP_DOUBLE:
                useCandidates = RBM_DOUBLERET;
                break;
#if defined(_TARGET_64BIT_)
            case TYP_LONG:
                useCandidates = RBM_LNGRET;
                break;
#endif // defined(_TARGET_64BIT_)
            default:
                useCandidates = RBM_INTRET;
                break;
            }
        }

        if (useCandidates != RBM_NONE)
        {
            op1->gtLsraInfo.setSrcCandidates(l, useCandidates);
        }
    }
}

//------------------------------------------------------------------------
// TreeNodeInfoInitShiftRotate: Set the NodeInfo for a shift or rotate.
//
// Arguments:
//    tree      - The node of interest
//
// Return Value:
//    None.
//
void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree)
{
    TreeNodeInfo* info = &(tree->gtLsraInfo);
    LinearScan*   l    = m_lsra;

    info->srcCount = 2;
    info->dstCount = 1;

    // For shift operations, the shift count must end up in CL whenever
    // the number of bits to shift is not a constant.
    GenTreePtr shiftBy = tree->gtOp.gtOp2;
    GenTreePtr source  = tree->gtOp.gtOp1;

    // x64 can encode 8 bits of shift, but it will only use 5 or 6 of them (the others are masked off).
    // We will allow whatever can be encoded - hope you know what you are doing.
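    // Added note: the hardware masks the shift count to the operand width, taking the low 5 bits
    // for 32-bit operands and the low 6 bits for 64-bit operands; for example, "shl eax, 33"
    // actually shifts by 1.  That is why out-of-range constants are still accepted here.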
    if (!IsContainableImmed(tree, shiftBy) || (shiftBy->gtIntConCommon.IconValue() > 255) ||
        (shiftBy->gtIntConCommon.IconValue() < 0))
    {
        source->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX);
        shiftBy->gtLsraInfo.setSrcCandidates(l, RBM_RCX);
        info->setDstCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX);
    }
    else
    {
        MakeSrcContained(tree, shiftBy);
    }
}

//------------------------------------------------------------------------
// TreeNodeInfoInitCall: Set the NodeInfo for a call.
//
// Arguments:
//    call      - The call node of interest
//
// Return Value:
//    None.
//
void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
{
    TreeNodeInfo*   info              = &(call->gtLsraInfo);
    LinearScan*     l                 = m_lsra;
    Compiler*       compiler          = comp;
    bool            hasMultiRegRetVal = false;
    ReturnTypeDesc* retTypeDesc       = nullptr;

    info->srcCount = 0;
    if (call->TypeGet() != TYP_VOID)
    {
        hasMultiRegRetVal = call->HasMultiRegRetVal();
        if (hasMultiRegRetVal)
        {
            // dst count = number of registers in which the value is returned by call
            retTypeDesc    = call->GetReturnTypeDesc();
            info->dstCount = retTypeDesc->GetReturnRegCount();
        }
        else
        {
            info->dstCount = 1;
        }
    }
    else
    {
        info->dstCount = 0;
    }

    GenTree* ctrlExpr = call->gtControlExpr;
    if (call->gtCallType == CT_INDIRECT)
    {
        // either gtControlExpr != null or gtCallAddr != null.
        // Both cannot be non-null at the same time.
        assert(ctrlExpr == nullptr);
        assert(call->gtCallAddr != nullptr);
        ctrlExpr = call->gtCallAddr;
    }

    // set reg requirements on call target represented as control sequence.
    if (ctrlExpr != nullptr)
    {
        // we should never see a gtControlExpr whose type is void.
        assert(ctrlExpr->TypeGet() != TYP_VOID);

        // the call can take an r/m operand on x64
        info->srcCount++;

        // In case of fast tail call implemented as jmp, make sure that gtControlExpr is
        // computed into a register.
        if (!call->IsFastTailCall())
        {
            if (ctrlExpr->isIndir())
            {
                MakeSrcContained(call, ctrlExpr);
            }
        }
        else
        {
            // Fast tail call - make sure that call target is always computed in RAX
            // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
            ctrlExpr->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
        }
    }

    // If this is a varargs call, we will clear the internal candidates in case we need
    // to reserve some integer registers for copying float args.
    // We have to do this because otherwise the default candidates are allRegs, and adding
    // the individual specific registers will have no effect.
    if (call->IsVarargs())
    {
        info->setInternalCandidates(l, RBM_NONE);
    }

    RegisterType registerType = call->TypeGet();

    // Set destination candidates for return value of the call.
#ifdef _TARGET_X86_
    if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
    {
        // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
        // TCB in REG_PINVOKE_TCB.  AMD64/ARM64 use the standard calling convention.  fgMorphCall() sets the
        // correct argument registers.
        info->setDstCandidates(l, RBM_PINVOKE_TCB);
    }
    else
#endif // _TARGET_X86_
    if (hasMultiRegRetVal)
    {
        assert(retTypeDesc != nullptr);
        info->setDstCandidates(l, retTypeDesc->GetABIReturnRegs());
    }
    else if (varTypeIsFloating(registerType))
    {
#ifdef _TARGET_X86_
        // The return value will be on the X87 stack, and we will need to move it.
        info->setDstCandidates(l, l->allRegs(registerType));
#else  // !_TARGET_X86_
        info->setDstCandidates(l, RBM_FLOATRET);
#endif // !_TARGET_X86_
    }
    else if (registerType == TYP_LONG)
    {
        info->setDstCandidates(l, RBM_LNGRET);
    }
    else
    {
        info->setDstCandidates(l, RBM_INTRET);
    }

    // number of args to a call =
    //   callRegArgs + (callargs - placeholders, setup, etc)
    // there is an explicit thisPtr but it is redundant

    // If there is an explicit this pointer, we don't want that node to produce anything
    // as it is redundant
    if (call->gtCallObjp != nullptr)
    {
        GenTreePtr thisPtrNode = call->gtCallObjp;

        if (thisPtrNode->gtOper == GT_PUTARG_REG)
        {
            l->clearOperandCounts(thisPtrNode);
            l->clearDstCount(thisPtrNode->gtOp.gtOp1);
        }
        else
        {
            l->clearDstCount(thisPtrNode);
        }
    }

    // First, count reg args
#if FEATURE_VARARG
    bool callHasFloatRegArgs = false;
#endif // !FEATURE_VARARG

    for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
    {
        assert(list->IsList());

        GenTreePtr argNode = list->Current();

        fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
        assert(curArgTabEntry);

        if (curArgTabEntry->regNum == REG_STK)
        {
            // late arg that is not passed in a register
            DISPNODE(argNode);
            assert(argNode->gtOper == GT_PUTARG_STK);
            argNode->gtLsraInfo.srcCount = 1;
            argNode->gtLsraInfo.dstCount = 0;

#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
            // If the node is TYP_STRUCT and it is put on stack with
            // putarg_stk operation, we consume and produce no registers.
            // In this case the embedded Obj node should not produce a
            // register either, since it is contained.
            // Note that if it is a SIMD type the argument will be in a register.
            if (argNode->TypeGet() == TYP_STRUCT)
            {
                assert(argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_OBJ);
                argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0;
                argNode->gtLsraInfo.srcCount             = 0;
            }
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
            continue;
        }

        regNumber argReg    = REG_NA;
        regMaskTP argMask   = RBM_NONE;
        short     regCount  = 0;
        bool      isOnStack = true;
        if (curArgTabEntry->regNum != REG_STK)
        {
            isOnStack         = false;
            var_types argType = argNode->TypeGet();

#if FEATURE_VARARG
            callHasFloatRegArgs |= varTypeIsFloating(argType);
#endif // !FEATURE_VARARG

            argReg   = curArgTabEntry->regNum;
            regCount = 1;

            // Default case is that we consume one source; modify this later (e.g. for
            // promoted structs)
            info->srcCount++;

            argMask = genRegMask(argReg);
            argNode = argNode->gtEffectiveVal();
        }

        // If the struct arg is wrapped in CPYBLK the type of the param will be TYP_VOID.
        // Use the curArgTabEntry's isStruct to get whether the param is a struct.
        if (varTypeIsStruct(argNode) FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(|| curArgTabEntry->isStruct))
        {
            unsigned   originalSize = 0;
            LclVarDsc* varDsc       = nullptr;
            if (argNode->gtOper == GT_LCL_VAR)
            {
                varDsc       = compiler->lvaTable + argNode->gtLclVarCommon.gtLclNum;
                originalSize = varDsc->lvSize();
            }
            else if (argNode->gtOper == GT_MKREFANY)
            {
                originalSize = 2 * TARGET_POINTER_SIZE;
            }
            else if (argNode->gtOper == GT_OBJ)
            {
                noway_assert(!"GT_OBJ not supported for amd64");
            }
#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
            else if (argNode->gtOper == GT_PUTARG_REG)
            {
                originalSize = genTypeSize(argNode->gtType);
            }
            else if (argNode->gtOper == GT_LIST)
            {
                originalSize = 0;

                // There could be up to 2 PUTARG_REGs in the list
                GenTreeArgList* argListPtr   = argNode->AsArgList();
                unsigned        iterationNum = 0;
                for (; argListPtr; argListPtr = argListPtr->Rest())
                {
                    GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1;
                    assert(putArgRegNode->gtOper == GT_PUTARG_REG);

                    if (iterationNum == 0)
                    {
                        varDsc       = compiler->lvaTable + putArgRegNode->gtOp.gtOp1->gtLclVarCommon.gtLclNum;
                        originalSize = varDsc->lvSize();
                        assert(originalSize != 0);
                    }
                    else
                    {
                        // Need an extra source for every node but the first in the list.
                        info->srcCount++;

                        // Get the mask for the second putarg_reg
                        argMask = genRegMask(curArgTabEntry->otherRegNum);
                    }

                    putArgRegNode->gtLsraInfo.setDstCandidates(l, argMask);
                    putArgRegNode->gtLsraInfo.setSrcCandidates(l, argMask);

                    // To avoid redundant moves, have the argument child tree computed in the
                    // register in which the argument is passed to the call.
                    putArgRegNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(putArgRegNode));
                    iterationNum++;
                }

                assert(iterationNum <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
            }
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
            else
            {
                noway_assert(!"Can't predict unsupported TYP_STRUCT arg kind");
            }

            unsigned slots          = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES;
            unsigned remainingSlots = slots;

            if (!isOnStack)
            {
                remainingSlots = slots - 1;

                regNumber reg = (regNumber)(argReg + 1);
                while (remainingSlots > 0 && reg <= REG_ARG_LAST)
                {
                    argMask |= genRegMask(reg);
                    reg = (regNumber)(reg + 1);
                    remainingSlots--;
                    regCount++;
                }
            }

            short internalIntCount = 0;
            if (remainingSlots > 0)
            {
                // This TYP_STRUCT argument is also passed in the outgoing argument area.
                // We need a register to address the TYP_STRUCT, and we may need 2.
#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
                internalIntCount = 1;
#else  // FEATURE_UNIX_AMD64_STRUCT_PASSING
                internalIntCount = 2;
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
            }
            argNode->gtLsraInfo.internalIntCount = internalIntCount;

#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
            if (argNode->gtOper == GT_PUTARG_REG)
            {
                argNode->gtLsraInfo.setDstCandidates(l, argMask);
                argNode->gtLsraInfo.setSrcCandidates(l, argMask);
            }
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
        }
        else
        {
            argNode->gtLsraInfo.setDstCandidates(l, argMask);
            argNode->gtLsraInfo.setSrcCandidates(l, argMask);
        }

        // To avoid redundant moves, have the argument child tree computed in the
        // register in which the argument is passed to the call.
        if (argNode->gtOper == GT_PUTARG_REG)
        {
            argNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(argNode));
        }

#if FEATURE_VARARG
        // In the case of a varargs call, the ABI dictates that if we have floating point args,
        // we must pass the enregistered arguments in both the integer and floating point registers.
        // Since the integer register is not associated with this arg node, we will reserve it as
        // an internal register so that it is not used during the evaluation of the call node
        // (e.g. for the target).
        if (call->IsVarargs() && varTypeIsFloating(argNode))
        {
            regNumber targetReg = compiler->getCallArgIntRegister(argReg);
            info->setInternalIntCount(info->internalIntCount + 1);
            info->addInternalCandidates(l, genRegMask(targetReg));
        }
#endif // FEATURE_VARARG
    }

    // Now, count stack args
    // Note that these need to be computed into a register, but then
    // they're just stored to the stack - so the reg doesn't
    // need to remain live until the call.  In fact, it must not
    // because the code generator doesn't actually consider it live,
    // so it can't be spilled.

    GenTreePtr args = call->gtCallArgs;
    while (args)
    {
        GenTreePtr arg = args->gtOp.gtOp1;
        if (!(args->gtFlags & GTF_LATE_ARG))
        {
            TreeNodeInfo* argInfo = &(arg->gtLsraInfo);
#if !defined(_TARGET_64BIT_)
            if (arg->TypeGet() == TYP_LONG)
            {
                assert(arg->OperGet() == GT_LONG);
                GenTreePtr loArg = arg->gtGetOp1();
                GenTreePtr hiArg = arg->gtGetOp2();
                assert((loArg->OperGet() == GT_PUTARG_STK) && (hiArg->OperGet() == GT_PUTARG_STK));
                assert((loArg->gtLsraInfo.dstCount == 1) && (hiArg->gtLsraInfo.dstCount == 1));
                loArg->gtLsraInfo.isLocalDefUse = true;
                hiArg->gtLsraInfo.isLocalDefUse = true;
            }
            else
#endif // !defined(_TARGET_64BIT_)
            {
                if (argInfo->dstCount != 0)
                {
                    argInfo->isLocalDefUse = true;
                }

                // If the child of GT_PUTARG_STK is a constant, we don't need a register to
                // move it to memory (stack location).
                // We don't want to make 0 contained, because we can generate smaller code
                // by zeroing a register and then storing it.
                argInfo->dstCount = 0;
                if (arg->gtOper == GT_PUTARG_STK)
                {
                    GenTree* op1 = arg->gtOp.gtOp1;
                    if (IsContainableImmed(arg, op1) && !op1->IsIntegralConst(0))
                    {
                        MakeSrcContained(arg, op1);
                    }
                }
            }
        }
        args = args->gtOp.gtOp2;
    }

#if FEATURE_VARARG
    // If it is a fast tail call, it is already preferenced to use RAX.
    // Therefore, there is no need to set src candidates on the call target again.
    if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExpr != nullptr))
    {
        // Don't assign the call target to any of the argument registers because
        // we will use them to also pass floating point arguments as required
        // by the Amd64 ABI.
        ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS));
    }
#endif // !FEATURE_VARARG
}

//------------------------------------------------------------------------
// TreeNodeInfoInitBlockStore: Set the NodeInfo for a block store.
//
// Arguments:
//    blkNode       - The block store node of interest
//
// Return Value:
//    None.
//
void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlkOp* blkNode)
{
    GenTree*    dstAddr = blkNode->Dest();
    unsigned    size;
    LinearScan* l        = m_lsra;
    Compiler*   compiler = comp;

    // Sources are dest address, initVal or source, and size
    blkNode->gtLsraInfo.srcCount = 3;
    blkNode->gtLsraInfo.dstCount = 0;

    if (blkNode->OperGet() == GT_INITBLK)
    {
        GenTreeInitBlk* initBlkNode = blkNode->AsInitBlk();

        GenTreePtr blockSize = initBlkNode->Size();
        GenTreePtr initVal   = initBlkNode->InitVal();

        // If we have an InitBlk with constant block size we can optimize several ways:
        // a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes,
        //    we use rep stosb, since this reduces the register pressure in LSRA and we have
        //    roughly the same performance as calling the helper.
        // b) If the size is <= INITBLK_UNROLL_LIMIT bytes and the fill byte is a constant,
        //    we can speed this up by unrolling the loop using SSE2 stores.  The reason for
        //    this threshold is that, per our last investigation (Fall 2013), more than 95% of initblks
        //    in our framework assemblies are actually <= INITBLK_UNROLL_LIMIT bytes in size, so this is the
        //    preferred code sequence for the vast majority of cases.
        // This threshold decides between calling the helper and letting the JIT inline
        // a code sequence of its choice.
        ssize_t helperThreshold = max(INITBLK_STOS_LIMIT, INITBLK_UNROLL_LIMIT);

        // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
        if (blockSize->IsCnsIntOrI() && blockSize->gtIntCon.gtIconVal <= helperThreshold)
        {
            ssize_t size = blockSize->gtIntCon.gtIconVal;

            // Always favor unrolling vs rep stos.
            if (size <= INITBLK_UNROLL_LIMIT && initVal->IsCnsIntOrI())
            {
                // The fill value of an initblk is interpreted to hold a
                // value of (unsigned int8) however a constant of any size
                // may practically reside on the evaluation stack.  So extract
                // the lower byte out of the initVal constant and replicate
                // it to a larger constant whose size is sufficient to support
                // the largest width store of the desired inline expansion.
                ssize_t fill = initVal->gtIntCon.gtIconVal & 0xFF;
#ifdef _TARGET_AMD64_
                if (size < REGSIZE_BYTES)
                {
                    initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
                }
                else
                {
                    initVal->gtIntCon.gtIconVal = 0x0101010101010101LL * fill;
                    initVal->gtType             = TYP_LONG;
                }
#else  // !_TARGET_AMD64_
                initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
#endif // !_TARGET_AMD64_

                MakeSrcContained(blkNode, blockSize);

                // In case we have a buffer >= 16 bytes
                // we can use SSE2 to do a 128-bit store in a single
                // instruction.
                if (size >= XMM_REGSIZE_BYTES)
                {
                    // Reserve an XMM register to fill it with
                    // a pack of 16 init value constants.
                    blkNode->gtLsraInfo.internalFloatCount = 1;
                    blkNode->gtLsraInfo.setInternalCandidates(l, l->internalFloatRegCandidates());
                }
                initBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindUnroll;
            }
            else
            {
                // rep stos has the following register requirements:
                // a) The memory address to be in RDI.
                // b) The fill value has to be in RAX.
                // c) The buffer size must be in RCX.
                dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_RDI);
                initVal->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
                blockSize->gtLsraInfo.setSrcCandidates(l, RBM_RCX);
                initBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindRepInstr;
            }
        }
        else
        {
#ifdef _TARGET_AMD64_
            // The helper follows the regular AMD64 ABI.
            dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_ARG_0);
            initVal->gtLsraInfo.setSrcCandidates(l, RBM_ARG_1);
            blockSize->gtLsraInfo.setSrcCandidates(l, RBM_ARG_2);
            initBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindHelper;
#else  // !_TARGET_AMD64_
            dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_RDI);
            initVal->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
            blockSize->gtLsraInfo.setSrcCandidates(l, RBM_RCX);
            initBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindRepInstr;
#endif // !_TARGET_AMD64_
        }
    }
    else if (blkNode->OperGet() == GT_COPYOBJ)
    {
        GenTreeCpObj* cpObjNode = blkNode->AsCpObj();

        GenTreePtr clsTok  = cpObjNode->ClsTok();
        GenTreePtr srcAddr = cpObjNode->Source();

        unsigned slots = cpObjNode->gtSlots;

#ifdef DEBUG
        // CpObj must always have at least one GC-Pointer as a member.
        assert(cpObjNode->gtGcPtrCount > 0);
        assert(dstAddr->gtType == TYP_BYREF || dstAddr->gtType == TYP_I_IMPL);
        assert(clsTok->IsIconHandle());

        CORINFO_CLASS_HANDLE clsHnd    = (CORINFO_CLASS_HANDLE)clsTok->gtIntCon.gtIconVal;
        size_t               classSize = compiler->info.compCompHnd->getClassSize(clsHnd);
        size_t               blkSize   = roundUp(classSize, TARGET_POINTER_SIZE);

        // Currently, the EE always rounds up a class data structure, so we are not handling
        // the case where the struct size is not a multiple of the pointer size.  This behavior
        // may change in the future, so in order to keep things correct let's assert it just to
        // be safe.  Going forward we should simply handle this case.
        assert(classSize == blkSize);
        assert((blkSize / TARGET_POINTER_SIZE) == slots);
        assert(cpObjNode->HasGCPtr());
#endif

        bool IsRepMovsProfitable = false;

        // If the destination is not on the stack, let's find out if we
        // can improve code size by using rep movsq instead of generating
        // sequences of movsq instructions.
        if (!dstAddr->OperIsLocalAddr())
        {
            // Let's inspect the struct/class layout and determine if it's profitable
            // to use rep movsq for copying non-gc memory instead of using single movsq
            // instructions for each memory slot.
            unsigned i      = 0;
            BYTE*    gcPtrs = cpObjNode->gtGcPtrs;

            do
            {
                unsigned nonGCSlots = 0;
                // Measure a contiguous non-gc area inside the struct and note the maximum.
                while (i < slots && gcPtrs[i] == TYPE_GC_NONE)
                {
                    nonGCSlots++;
                    i++;
                }

                while (i < slots && gcPtrs[i] != TYPE_GC_NONE)
                {
                    i++;
                }

                if (nonGCSlots >= CPOBJ_NONGC_SLOTS_LIMIT)
                {
                    IsRepMovsProfitable = true;
                    break;
                }
            } while (i < slots);
        }
        else if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
        {
            IsRepMovsProfitable = true;
        }

        // There are two cases in which we need to materialize the
        // struct size:
        // a) When the destination is on the stack we don't need to use the
        //    write barrier, we can just simply call rep movsq and get a win in codesize.
        // b) If we determine we have contiguous non-gc regions in the struct where it's profitable
        //    to use rep movsq instead of a sequence of single movsq instructions.  According to the
        //    Intel Manual, the sweet spot for small structs is between 4 and 12 slots of size, where
        //    the entire operation takes 20 cycles and encodes in 5 bytes (moving RCX and calling rep movsq).
        if (IsRepMovsProfitable)
        {
            // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
            MakeSrcContained(blkNode, clsTok);
            blkNode->gtLsraInfo.internalIntCount = 1;
            blkNode->gtLsraInfo.setInternalCandidates(l, RBM_RCX);
        }
        else
        {
            // We don't need to materialize the struct size because we will unroll
            // the loop using movsq that automatically increments the pointers.
            MakeSrcContained(blkNode, clsTok);
        }

        dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_RDI);
        srcAddr->gtLsraInfo.setSrcCandidates(l, RBM_RSI);
    }
    else
    {
        assert(blkNode->OperGet() == GT_COPYBLK);
        GenTreeCpBlk* cpBlkNode = blkNode->AsCpBlk();

        GenTreePtr blockSize = cpBlkNode->Size();
        GenTreePtr srcAddr   = cpBlkNode->Source();

        // In case of a CpBlk with a constant size smaller than CPBLK_MOVS_LIMIT
        // we can use rep movs to generate code instead of the helper call.
        // This threshold decides between calling the helper and letting the JIT inline
        // a code sequence of its choice.
        ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);

        // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
        if (blockSize->IsCnsIntOrI() && blockSize->gtIntCon.gtIconVal <= helperThreshold)
        {
            assert(!blockSize->IsIconHandle());
            ssize_t size = blockSize->gtIntCon.gtIconVal;

            // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
            // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
            // our framework assemblies, so this is the main code generation scheme we'll use.
            if (size <= CPBLK_UNROLL_LIMIT)
            {
                MakeSrcContained(blkNode, blockSize);

                // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
                //
                // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
                // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
                // RBM_NON_BYTE_REGS from internal candidates.
                if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
                {
                    blkNode->gtLsraInfo.internalIntCount++;
                    regMaskTP regMask = l->allRegs(TYP_INT);

#ifdef _TARGET_X86_
                    if ((size % 2) != 0)
                    {
                        regMask &= ~RBM_NON_BYTE_REGS;
                    }
#endif
                    blkNode->gtLsraInfo.setInternalCandidates(l, regMask);
                }

                if (size >= XMM_REGSIZE_BYTES)
                {
                    // If we have a buffer larger than XMM_REGSIZE_BYTES,
                    // reserve an XMM register to use it for a
                    // series of 16-byte loads and stores.
                    blkNode->gtLsraInfo.internalFloatCount = 1;
                    blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates());
                }

                // If src or dst are on stack, we don't have to generate the address into a register
                // because it's just some constant+SP
                if (srcAddr->OperIsLocalAddr())
                {
                    MakeSrcContained(blkNode, srcAddr);
                }

                if (dstAddr->OperIsLocalAddr())
                {
                    MakeSrcContained(blkNode, dstAddr);
                }

                cpBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindUnroll;
            }
            else
            {
                dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_RDI);
                srcAddr->gtLsraInfo.setSrcCandidates(l, RBM_RSI);
                blockSize->gtLsraInfo.setSrcCandidates(l, RBM_RCX);
                cpBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindRepInstr;
            }
        }
#ifdef _TARGET_AMD64_
        else
        {
            // In case we have a constant integer this means we went beyond
            // CPBLK_MOVS_LIMIT bytes of size; still, we should never have
            // any GC-Pointers in the src struct.
            if (blockSize->IsCnsIntOrI())
            {
                assert(!blockSize->IsIconHandle());
            }

            dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_ARG_0);
            srcAddr->gtLsraInfo.setSrcCandidates(l, RBM_ARG_1);
            blockSize->gtLsraInfo.setSrcCandidates(l, RBM_ARG_2);
            cpBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindHelper;
        }
#elif defined(_TARGET_X86_)
        else
        {
            dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_RDI);
            srcAddr->gtLsraInfo.setSrcCandidates(l, RBM_RSI);
            blockSize->gtLsraInfo.setSrcCandidates(l, RBM_RCX);
            cpBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindRepInstr;
        }
#endif // _TARGET_X86_
    }
}

#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
//------------------------------------------------------------------------
// TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
//
// Arguments:
//    tree      - The node of interest
//
// Return Value:
//    None.
//
void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree)
{
    TreeNodeInfo* info = &(tree->gtLsraInfo);
    LinearScan*   l    = m_lsra;

    if (tree->TypeGet() != TYP_STRUCT)
    {
        TreeNodeInfoInitSimple(tree);
        return;
    }

    GenTreePutArgStk* putArgStkTree = tree->AsPutArgStk();

    GenTreePtr dst     = tree;
    GenTreePtr src     = tree->gtOp.gtOp1;
    GenTreePtr srcAddr = nullptr;

    if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
    {
        srcAddr = src->gtOp.gtOp1;
    }
    else
    {
        assert(varTypeIsSIMD(tree));
    }
    info->srcCount = src->gtLsraInfo.dstCount;

    // If this is a stack variable address,
    // make the op1 contained, so this way
    // there is no unnecessary copying between registers.
    // To avoid assertion, increment the parent's source.
    // It is recovered below.
    bool haveLocalAddr = ((srcAddr != nullptr) && (srcAddr->OperIsLocalAddr()));
    if (haveLocalAddr)
    {
        info->srcCount += 1;
    }

    info->dstCount = 0;

    // In case of a CpBlk we could use a helper call.  In case of putarg_stk we
    // can't do that since the helper call could kill some already set up outgoing args.
    // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
    // The cpyXXXX code is rather complex and this could cause it to be more complex, but
    // it might be the right thing to do.
    // This threshold decides between calling the helper and letting the JIT inline
    // a code sequence of its choice.
    ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
    ssize_t size            = putArgStkTree->gtNumSlots * TARGET_POINTER_SIZE;

    // TODO-X86-CQ: The helper call either is not supported on x86 or required more work
    // (I don't know which).

    // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
    // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
    // our framework assemblies, so this is the main code generation scheme we'll use.
    if (size <= CPBLK_UNROLL_LIMIT && putArgStkTree->gtNumberReferenceSlots == 0)
    {
        // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
        //
        // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
        // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
        // RBM_NON_BYTE_REGS from internal candidates.
        if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
        {
            info->internalIntCount++;
            regMaskTP regMask = l->allRegs(TYP_INT);

#ifdef _TARGET_X86_
            if ((size % 2) != 0)
            {
                regMask &= ~RBM_NON_BYTE_REGS;
            }
#endif
            info->setInternalCandidates(l, regMask);
        }

        if (size >= XMM_REGSIZE_BYTES)
        {
            // If we have a buffer larger than XMM_REGSIZE_BYTES,
            // reserve an XMM register to use it for a
            // series of 16-byte loads and stores.
            info->internalFloatCount = 1;
            info->addInternalCandidates(l, l->internalFloatRegCandidates());
        }

        if (haveLocalAddr)
        {
            MakeSrcContained(putArgStkTree, srcAddr);
        }

        // If src or dst are on stack, we don't have to generate the address into a register
        // because it's just some constant+SP
        putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindUnroll;
    }
    else
    {
        info->internalIntCount += 3;
        info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI));
        if (haveLocalAddr)
        {
            MakeSrcContained(putArgStkTree, srcAddr);
        }

        putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindRepInstr;
    }

    // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
    MakeSrcContained(putArgStkTree, src);

    // Balance up the inc above.
    if (haveLocalAddr)
    {
        info->srcCount -= 1;
    }
}
#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING

//------------------------------------------------------------------------
// TreeNodeInfoInitLclHeap: Set the NodeInfo for a GT_LCLHEAP.
//
// Arguments:
//    tree      - The node of interest
//
// Return Value:
//    None.
//
void Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
{
    TreeNodeInfo* info     = &(tree->gtLsraInfo);
    LinearScan*   l        = m_lsra;
    Compiler*     compiler = comp;

    info->srcCount = 1;
    info->dstCount = 1;

    // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
    // Here '-' means don't care.
    //
    //     Size?                    Init Memory?    # temp regs
    //     0                        -               0
    //     const and <=6 reg words  -               0
    //     const and >6 reg words   Yes             0
    //     const and <PageSize      No              0 (1 on x86)
    //     const and >=PageSize     No              2
    //     Non-const                Yes             0
    //     Non-const                No              2
    GenTreePtr size = tree->gtOp.gtOp1;
    if (size->IsCnsIntOrI())
    {
        MakeSrcContained(tree, size);

        size_t sizeVal = size->gtIntCon.gtIconVal;

        if (sizeVal == 0)
        {
            info->internalIntCount = 0;
        }
        else
        {
            // Compute the amount of memory to properly STACK_ALIGN.
            // Note: The GenTree node is not updated here as it is cheap to recompute stack aligned size.
            // This should also help in debugging as we can examine the original size specified with localloc.
            sizeVal = AlignUp(sizeVal, STACK_ALIGN);

            // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
            // we will generate 'push 0'.
            assert((sizeVal % REGSIZE_BYTES) == 0);
            size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
            if (cntRegSizedWords <= 6)
            {
                info->internalIntCount = 0;
            }
            else if (!compiler->info.compInitMem)
            {
                // No need to initialize allocated stack space.
                if (sizeVal < compiler->eeGetPageSize())
                {
#ifdef _TARGET_X86_
                    info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP.
#else  // !_TARGET_X86_
                    info->internalIntCount = 0;
#endif // !_TARGET_X86_
                }
                else
                {
                    // We need two registers: regCnt and RegTmp
                    info->internalIntCount = 2;
                }
            }
            else
            {
                // >6 and need to zero initialize allocated stack space.
                info->internalIntCount = 0;
            }
        }
    }
    else
    {
        if (!compiler->info.compInitMem)
        {
            info->internalIntCount = 2;
        }
        else
        {
            info->internalIntCount = 0;
        }
    }
}

//------------------------------------------------------------------------
// TreeNodeInfoInitLogicalOp: Set the NodeInfo for GT_AND/GT_OR/GT_XOR,
// as well as GT_ADD/GT_SUB.
//
// Arguments:
//    tree      - The node of interest
//
// Return Value:
//    None.
//
void Lowering::TreeNodeInfoInitLogicalOp(GenTree* tree)
{
    TreeNodeInfo* info = &(tree->gtLsraInfo);
    LinearScan*   l    = m_lsra;

    // We are not marking a constant hanging on the left of the add
    // as containable, so it gets assigned a register, which has a CQ impact.
    // TODO-XArch-CQ: Detect this case and support generating a single instruction
    // for GT_ADD(Constant, SomeTree)
    info->srcCount = 2;
    info->dstCount = 1;

    GenTree* op1 = tree->gtGetOp1();
    GenTree* op2 = tree->gtGetOp2();

    // We can directly encode the second operand if it is either a containable constant or a memory-op.
    // In case of memory-op, we can encode it directly provided its type matches with 'tree' type.
    // This is because during codegen, type of 'tree' is used to determine emit Type size. If the types
    // do not match, they get normalized (i.e. sign/zero extended) on load into a register.
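    // Added example: when op2 can be encoded directly, codegen can emit a single instruction of the
    // form "and reg, dword ptr [mem]" (or "and reg, imm") instead of first loading the memory operand
    // or immediate into a scratch register and then performing a reg,reg operation.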
    bool       directlyEncodable = false;
    bool       binOpInRMW        = false;
    GenTreePtr operand           = nullptr;

    if (IsContainableImmed(tree, op2))
    {
        directlyEncodable = true;
        operand           = op2;
    }
    else
    {
        binOpInRMW = IsBinOpInRMWStoreInd(tree);
        if (!binOpInRMW)
        {
            if (op2->isMemoryOp() && tree->TypeGet() == op2->TypeGet())
            {
                directlyEncodable = true;
                operand           = op2;
            }
            else if (tree->OperIsCommutative())
            {
                if (IsContainableImmed(tree, op1) ||
                    (op1->isMemoryOp() && tree->TypeGet() == op1->TypeGet() && IsSafeToContainMem(tree, op1)))
                {
                    // If it is safe, we can reverse the order of operands of commutative operations for
                    // efficient codegen.
                    directlyEncodable = true;
                    operand           = op1;
                }
            }
        }
    }

    if (directlyEncodable)
    {
        assert(operand != nullptr);
        MakeSrcContained(tree, operand);
    }
    else if (!binOpInRMW)
    {
        // If this binary op neither has contained operands, nor is a
        // Read-Modify-Write (RMW) operation, we can mark its operands
        // as reg optional.
        SetRegOptionalForBinOp(tree);
    }
}

//------------------------------------------------------------------------
// TreeNodeInfoInitModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
//
// Arguments:
//    tree      - The node of interest
//
// Return Value:
//    None.
//
void Lowering::TreeNodeInfoInitModDiv(GenTree* tree)
{
    TreeNodeInfo* info = &(tree->gtLsraInfo);
    LinearScan*   l    = m_lsra;

    GenTree* op1 = tree->gtGetOp1();
    GenTree* op2 = tree->gtGetOp2();

    info->srcCount = 2;
    info->dstCount = 1;

    switch (tree->OperGet())
    {
    case GT_MOD:
    case GT_DIV:
        if (varTypeIsFloating(tree->TypeGet()))
        {
            // No implicit conversions at this stage as the expectation is that
            // everything is made explicit by adding casts.
            assert(op1->TypeGet() == op2->TypeGet());

            if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
            {
                MakeSrcContained(tree, op2);
            }
            else
            {
                // If there are no containable operands, we can make an operand reg optional.
                SetRegOptionalForBinOp(tree);
            }
            return;
        }
        break;

    default:
        break;
    }

    // Amd64 Div/Idiv instruction:
    //    Dividend in RAX:RDX and computes
    //    Quotient in RAX, Remainder in RDX

    if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
    {
        // We are interested in just the remainder.
        // RAX is used as a trashable register during computation of remainder.
        info->setDstCandidates(l, RBM_RDX);
    }
    else
    {
        // We are interested in just the quotient.
        // RDX gets used as trashable register during computation of quotient
        info->setDstCandidates(l, RBM_RAX);
    }

    // If possible would like to have op1 in RAX to avoid a register move
    op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX);

    // divisor can be an r/m, but the memory indirection must be of the same size as the divide
    if (op2->isMemoryOp() && (op2->TypeGet() == tree->TypeGet()))
    {
        MakeSrcContained(tree, op2);
    }
    else
    {
        op2->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));

        // If there are no containable operands, we can make an operand reg optional.
        SetRegOptionalForBinOp(tree);
    }
}

//------------------------------------------------------------------------
// TreeNodeInfoInitIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
//
// Arguments:
//    tree      - The node of interest
//
// Return Value:
//    None.
//
void Lowering::TreeNodeInfoInitIntrinsic(GenTree* tree)
{
    TreeNodeInfo* info = &(tree->gtLsraInfo);
    LinearScan*   l    = m_lsra;

    // Both operand and its result must be of floating point type.
GenTree* op1 = tree->gtGetOp1(); assert(varTypeIsFloating(op1)); assert(op1->TypeGet() == tree->TypeGet()); info->srcCount = 1; info->dstCount = 1; switch(tree->gtIntrinsic.gtIntrinsicId) { case CORINFO_INTRINSIC_Sqrt: if (op1->isMemoryOp() || op1->IsCnsNonZeroFltOrDbl()) { MakeSrcContained(tree, op1); } else { // Mark the operand as reg optional since codegen can still // generate code if op1 is on stack. TryToSetRegOptional(op1); } break; case CORINFO_INTRINSIC_Abs: // Abs(float x) = x & 0x7fffffff // Abs(double x) = x & 0x7ffffff ffffffff // In case of Abs we need an internal register to hold mask. // TODO-XArch-CQ: avoid using an internal register for the mask. // Andps or andpd both will operate on 128-bit operands. // The data section constant to hold the mask is a 64-bit size. // Therefore, we need both the operand and mask to be in // xmm register. When we add support in emitter to emit 128-bit // data constants and instructions that operate on 128-bit // memory operands we can avoid the need for an internal register. if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs) { info->internalFloatCount = 1; info->setInternalCandidates(l, l->internalFloatRegCandidates()); } break; #ifdef _TARGET_X86_ case CORINFO_INTRINSIC_Cos: case CORINFO_INTRINSIC_Sin: case CORINFO_INTRINSIC_Round: NYI_X86("Math intrinsics Cos, Sin and Round"); break; #endif // _TARGET_X86_ default: // Right now only Sqrt/Abs are treated as math intrinsics noway_assert(!"Unsupported math intrinsic"); unreached(); break; } } #ifdef FEATURE_SIMD //------------------------------------------------------------------------ // TreeNodeInfoInitSIMD: Set the NodeInfo for a GT_SIMD tree. // // Arguments: // tree - The GT_SIMD node of interest // // Return Value: // None. void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) { GenTreeSIMD* simdTree = tree->AsSIMD(); TreeNodeInfo* info = &(tree->gtLsraInfo); LinearScan* lsra = m_lsra; info->dstCount = 1; switch(simdTree->gtSIMDIntrinsicID) { GenTree* op2; case SIMDIntrinsicInit: { info->srcCount = 1; GenTree* op1 = tree->gtOp.gtOp1; // This sets all fields of a SIMD struct to the given value. // Mark op1 as contained if it is either zero or int constant of all 1's, // or a float constant with 16 or 32 byte simdType (AVX case) // // Should never see small int base type vectors except for zero initialization. assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0)); if (op1->IsFPZero() || op1->IsIntegralConst(0) || (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1))) { MakeSrcContained(tree, tree->gtOp.gtOp1); info->srcCount = 0; } else if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) && ((simdTree->gtSIMDSize == 16) || (simdTree->gtSIMDSize == 32))) { // Either op1 is a float or dbl constant or an addr if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr()) { MakeSrcContained(tree, tree->gtOp.gtOp1); info->srcCount = 0; } } } break; case SIMDIntrinsicInitN: { info->srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(simdTree->gtSIMDBaseType)); // Need an internal register to stitch together all the values into a single vector in a SIMD reg info->internalFloatCount = 1; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); } break; case SIMDIntrinsicInitArray: // We have an array and an index, which may be contained. 
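        // (Added note, assumption: this intrinsic corresponds to constructing a SIMD value from
        //  an array, e.g. a Vector<T>(T[] array, int index) style constructor; a constant index
        //  can be folded into the load's addressing mode, which is why it may be contained.)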
info->srcCount = 2; CheckImmedAndMakeContained(tree, tree->gtGetOp2()); break; case SIMDIntrinsicDiv: // SSE2 has no instruction support for division on integer vectors noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType)); info->srcCount = 2; break; case SIMDIntrinsicAbs: // This gets implemented as bitwise-And operation with a mask // and hence should never see it here. unreached(); break; case SIMDIntrinsicSqrt: // SSE2 has no instruction support for sqrt on integer vectors. noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType)); info->srcCount = 1; break; case SIMDIntrinsicAdd: case SIMDIntrinsicSub: case SIMDIntrinsicMul: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseAndNot: case SIMDIntrinsicBitwiseOr: case SIMDIntrinsicBitwiseXor: case SIMDIntrinsicMin: case SIMDIntrinsicMax: info->srcCount = 2; // SSE2 32-bit integer multiplication requires two temp regs if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT) { info->internalFloatCount = 2; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); } break; case SIMDIntrinsicEqual: info->srcCount = 2; break; // SSE2 doesn't support < and <= directly on int vectors. // Instead we need to use > and >= with swapped operands. case SIMDIntrinsicLessThan: case SIMDIntrinsicLessThanOrEqual: info->srcCount = 2; noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType)); break; // SIMDIntrinsicEqual is supported only on non-floating point base type vectors. // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors. // Instead we need to use < and <= with swapped operands. case SIMDIntrinsicGreaterThan: noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType)); info->srcCount = 2; break; case SIMDIntrinsicOpEquality: case SIMDIntrinsicOpInEquality: // Need two SIMD registers as scratch. // See genSIMDIntrinsicRelOp() for details on code sequence generate and // the need for two scratch registers. info->srcCount = 2; info->internalFloatCount = 2; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); break; case SIMDIntrinsicDotProduct: if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) || (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32)) { // For SSE, or AVX with 32-byte vectors, we also need an internal register as scratch. // Further we need the targetReg and internal reg to be distinct registers. // This is achieved by requesting two internal registers; thus one of them // will be different from targetReg. // Note that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg. // // See genSIMDIntrinsicDotProduct() for details on code sequence generated and // the need for scratch registers. info->internalFloatCount = 2; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); } info->srcCount = 2; break; case SIMDIntrinsicGetItem: // This implements get_Item method. The sources are: // - the source SIMD struct // - index (which element to get) // The result is baseType of SIMD struct. info->srcCount = 2; op2 = tree->gtOp.gtOp2; // If the index is a constant, mark it as contained. if (CheckImmedAndMakeContained(tree, op2)) { info->srcCount = 1; } // If the index is not a constant, we will use the SIMD temp location to store the vector. // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we // can use that in the process of extracting the element. // // If the index is a constant and base type is a small int we can use pextrw, but on AVX // we will need a temp if are indexing into the upper half of the AVX register. 
// In all other cases with constant index, we need a temp xmm register to extract the // element if index is other than zero. if (!op2->IsCnsIntOrI()) { (void) comp->getSIMDInitTempVarNum(); } else if (!varTypeIsFloating(simdTree->gtSIMDBaseType)) { bool needFloatTemp; if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) && (comp->getSIMDInstructionSet() == InstructionSet_AVX)) { int byteShiftCnt = (int) op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType); needFloatTemp = (byteShiftCnt >= 16); } else { needFloatTemp = !op2->IsIntegralConst(0); } if (needFloatTemp) { info->internalFloatCount = 1; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); } } break; case SIMDIntrinsicSetX: case SIMDIntrinsicSetY: case SIMDIntrinsicSetZ: case SIMDIntrinsicSetW: // We need an internal integer register info->srcCount = 2; info->internalIntCount = 1; info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT)); break; case SIMDIntrinsicCast: info->srcCount = 1; break; case SIMDIntrinsicShuffleSSE2: info->srcCount = 2; // Second operand is an integer constant and marked as contained. op2 = tree->gtOp.gtOp2; noway_assert(op2->IsCnsIntOrI()); MakeSrcContained(tree, op2); break; case SIMDIntrinsicGetX: case SIMDIntrinsicGetY: case SIMDIntrinsicGetZ: case SIMDIntrinsicGetW: case SIMDIntrinsicGetOne: case SIMDIntrinsicGetZero: case SIMDIntrinsicGetCount: case SIMDIntrinsicGetAllOnes: assert(!"Get intrinsics should not be seen during Lowering."); unreached(); default: noway_assert(!"Unimplemented SIMD node type."); unreached(); } } #endif // FEATURE_SIMD //------------------------------------------------------------------------ // TreeNodeInfoInitCast: Set the NodeInfo for a GT_CAST. // // Arguments: // tree - The node of interest // // Return Value: // None. // void Lowering::TreeNodeInfoInitCast(GenTree* tree) { TreeNodeInfo* info = &(tree->gtLsraInfo); // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register. // see CodeGen::genIntToIntCast() info->srcCount = 1; info->dstCount = 1; // Non-overflow casts to/from float/double are done using SSE2 instructions // and that allow the source operand to be either a reg or memop. Given the // fact that casts from small int to float/double are done as two-level casts, // the source operand is always guaranteed to be of size 4 or 8 bytes. var_types castToType = tree->CastToType(); GenTreePtr castOp = tree->gtCast.CastOp(); var_types castOpType = castOp->TypeGet(); if (tree->gtFlags & GTF_UNSIGNED) { castOpType = genUnsignedType(castOpType); } if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType))) { #ifdef DEBUG // If converting to float/double, the operand must be 4 or 8 byte in size. if (varTypeIsFloating(castToType)) { unsigned opSize = genTypeSize(castOpType); assert(opSize == 4 || opSize == 8); } #endif //DEBUG // U8 -> R8 conversion requires that the operand be in a register. if (castOpType != TYP_ULONG) { if (castOp->isMemoryOp() || castOp->IsCnsNonZeroFltOrDbl()) { MakeSrcContained(tree, castOp); } else { // Mark castOp as reg optional to indicate codegen // can still generate code if it is on stack. 
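            // (Added sketch of the idea: if LSRA does not end up assigning castOp a register,
            //  codegen can fold its stack location straight into the conversion, e.g.
            //  'cvtsi2sd xmm0, dword ptr [stack slot]', instead of reloading it first.)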
TryToSetRegOptional(castOp); } } } #if !defined(_TARGET_64BIT_) if (varTypeIsLong(castOpType)) { noway_assert(castOp->OperGet() == GT_LONG); info->srcCount = 2; } #endif // !defined(_TARGET_64BIT_) // some overflow checks need a temp reg: // - GT_CAST from INT64/UINT64 to UINT32 if (tree->gtOverflow() && (castToType == TYP_UINT)) { if (genTypeSize(castOpType) == 8) { info->internalIntCount = 1; } } } void Lowering::LowerGCWriteBarrier(GenTree* tree) { assert(tree->OperGet() == GT_STOREIND); GenTreeStoreInd* dst = tree->AsStoreInd(); GenTreePtr addr = dst->Addr(); GenTreePtr src = dst->Data(); if (addr->OperGet() == GT_LEA) { // In the case where we are doing a helper assignment, if the dst // is an indir through an lea, we need to actually instantiate the // lea in a register GenTreeAddrMode* lea = addr->AsAddrMode(); int leaSrcCount = 0; if (lea->HasBase()) { leaSrcCount++; } if (lea->HasIndex()) { leaSrcCount++; } lea->gtLsraInfo.srcCount = leaSrcCount; lea->gtLsraInfo.dstCount = 1; } bool useOptimizedWriteBarrierHelper = false; // By default, assume no optimized write barriers. #if NOGC_WRITE_BARRIERS #if defined(_TARGET_X86_) useOptimizedWriteBarrierHelper = true; // On x86, use the optimized write barriers by default. #ifdef DEBUG GCInfo::WriteBarrierForm wbf = comp->codeGen->gcInfo.gcIsWriteBarrierCandidate(tree, src); if (wbf == GCInfo::WBF_NoBarrier_CheckNotHeapInDebug) // This one is always a call to a C++ method. { useOptimizedWriteBarrierHelper = false; } #endif if (useOptimizedWriteBarrierHelper) { // Special write barrier: // op1 (addr) goes into REG_WRITE_BARRIER (rdx) and // op2 (src) goes into any int register. addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER); src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER_SRC); } #else // !defined(_TARGET_X86_) #error "NOGC_WRITE_BARRIERS is not supported" #endif // !defined(_TARGET_X86_) #endif // NOGC_WRITE_BARRIERS if (!useOptimizedWriteBarrierHelper) { // For the standard JIT Helper calls: // op1 (addr) goes into REG_ARG_0 and // op2 (src) goes into REG_ARG_1 addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0); src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1); } // Both src and dst must reside in a register, which they should since we haven't set // either of them as contained. assert(addr->gtLsraInfo.dstCount == 1); assert(src->gtLsraInfo.dstCount == 1); } //----------------------------------------------------------------------------------------- // Specify register requirements for address expression of an indirection operation. // // Arguments: // indirTree - GT_IND or GT_STOREIND gentree node // void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree) { assert(indirTree->isIndir()); GenTreePtr addr = indirTree->gtGetOp1(); TreeNodeInfo* info = &(indirTree->gtLsraInfo); GenTreePtr base = nullptr; GenTreePtr index = nullptr; unsigned mul, cns; bool rev; bool modifiedSources = false; // If indirTree is of TYP_SIMD12, don't mark addr as contained // so that it always get computed to a register. This would // mean codegen side logic doesn't need to handle all possible // addr expressions that could be contained. // // TODO-XArch-CQ: handle other addr mode expressions that could be marked // as contained. #ifdef FEATURE_SIMD if (indirTree->TypeGet() == TYP_SIMD12) { // Vector3 is read/written as two reads/writes: 8 byte and 4 byte. // To assemble the vector properly we would need an additional // XMM register. 
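        // (Added note, assumption about the codegen sequence: the low 8 bytes and the upper
        //  4 bytes are moved separately, and the extra XMM register requested below holds the
        //  upper element while the two halves are recombined.)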
info->internalFloatCount = 1; // In case of GT_IND we need an internal register different from targetReg and // both of the registers are used at the same time. This achieved by reserving // two internal registers if (indirTree->OperGet() == GT_IND) { (info->internalFloatCount)++; } info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs()); return ; } #endif //FEATURE_SIMD // These nodes go into an addr mode: // - GT_CLS_VAR_ADDR turns into a constant. // - GT_LCL_VAR_ADDR is a stack addr mode. if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR)) { // make this contained, it turns into a constant that goes into an addr mode MakeSrcContained(indirTree, addr); } else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp) && addr->gtLsraInfo.getDstCandidates(m_lsra) != RBM_VIRTUAL_STUB_PARAM) { // Amd64: // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address. // (i.e. those VSD calls for which stub addr is known during JIT compilation time). In this case, // VM requires us to pass stub addr in REG_VIRTUAL_STUB_PARAM - see LowerVirtualStubCall(). For // that reason we cannot mark such an addr as contained. Note that this is not an issue for // indirect VSD calls since morphArgs() is explicitly materializing hidden param as a non-standard // argument. // // Workaround: // Note that LowerVirtualStubCall() sets addr->gtRegNum to REG_VIRTUAL_STUB_PARAM and Lowering::doPhase() // sets destination candidates on such nodes and resets addr->gtRegNum to REG_NA before calling // TreeNodeInfoInit(). Ideally we should set a flag on addr nodes that shouldn't be marked as contained // (in LowerVirtualStubCall()), but we don't have any GTF_* flags left for that purpose. As a workaround // an explicit check is made here. // // On x86, direct VSD is done via a relative branch, and in fact it MUST be contained. MakeSrcContained(indirTree, addr); } else if (addr->OperGet() == GT_LEA) { GenTreeAddrMode* lea = addr->AsAddrMode(); base = lea->Base(); index = lea->Index(); m_lsra->clearOperandCounts(addr); // The srcCount is decremented because addr is now "contained", // then we account for the base and index below, if they are non-null. info->srcCount--; } else if (comp->codeGen->genCreateAddrMode(addr, -1, true, 0, &rev, &base, &index, &mul, &cns, true /*nogen*/) && !(modifiedSources = AreSourcesPossiblyModified(indirTree, base, index))) { // An addressing mode will be constructed that may cause some // nodes to not need a register, and cause others' lifetimes to be extended // to the GT_IND or even its parent if it's an assignment assert(base != addr); m_lsra->clearOperandCounts(addr); GenTreePtr arrLength = nullptr; // Traverse the computation below GT_IND to find the operands // for the addressing mode, marking the various constants and // intermediate results as not consuming/producing. // If the traversal were more complex, we might consider using // a traversal function, but the addressing mode is only made // up of simple arithmetic operators, and the code generator // only traverses one leg of each node. bool foundBase = (base == nullptr); bool foundIndex = (index == nullptr); GenTreePtr nextChild = nullptr; for (GenTreePtr child = addr; child != nullptr && !child->OperIsLeaf(); child = nextChild) { nextChild = nullptr; GenTreePtr op1 = child->gtOp.gtOp1; GenTreePtr op2 = (child->OperIsBinary()) ? 
child->gtOp.gtOp2 : nullptr; if (op1 == base) { foundBase = true; } else if (op1 == index) { foundIndex = true; } else { m_lsra->clearOperandCounts(op1); if (!op1->OperIsLeaf()) { nextChild = op1; } } if (op2 != nullptr) { if (op2 == base) { foundBase = true; } else if (op2 == index) { foundIndex = true; } else { m_lsra->clearOperandCounts(op2); if (!op2->OperIsLeaf()) { assert(nextChild == nullptr); nextChild = op2; } } } } assert(foundBase && foundIndex); info->srcCount--; // it gets incremented below. } else if (addr->gtOper == GT_ARR_ELEM) { // The GT_ARR_ELEM consumes all the indices and produces the offset. // The array object lives until the mem access. // We also consume the target register to which the address is // computed info->srcCount++; assert(addr->gtLsraInfo.srcCount >= 2); addr->gtLsraInfo.srcCount -= 1; } else { // it is nothing but a plain indir info->srcCount--; //base gets added in below base = addr; } if (base != nullptr) { info->srcCount++; } if (index != nullptr && !modifiedSources) { info->srcCount++; } } void Lowering::LowerCmp(GenTreePtr tree) { TreeNodeInfo* info = &(tree->gtLsraInfo); info->srcCount = 2; info->dstCount = 1; #ifdef _TARGET_X86_ info->setDstCandidates(m_lsra, RBM_BYTE_REGS); #endif // _TARGET_X86_ GenTreePtr op1 = tree->gtOp.gtOp1; GenTreePtr op2 = tree->gtOp.gtOp2; var_types op1Type = op1->TypeGet(); var_types op2Type = op2->TypeGet(); #if !defined(_TARGET_64BIT_) // Long compares will consume GT_LONG nodes, each of which produces two results. // Thus for each long operand there will be an additional source. // TODO-X86-CQ: Mark hiOp2 and loOp2 as contained if it is a constant or a memory op. if (varTypeIsLong(op1Type)) { info->srcCount++; } if (varTypeIsLong(op2Type)) { info->srcCount++; } #endif // !defined(_TARGET_64BIT_) // If either of op1 or op2 is floating point values, then we need to use // ucomiss or ucomisd to compare, both of which support the following form // ucomis[s|d] xmm, xmm/mem. That is only the second operand can be a memory // op. // // Second operand is a memory Op: Note that depending on comparison operator, // the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or // op2 can be a memory op depending on the comparison operator. if (varTypeIsFloating(op1Type)) { // The type of the operands has to be the same and no implicit conversions at this stage. assert(op1Type == op2Type); bool reverseOps; if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0) { // Unordered comparison case reverseOps = (tree->gtOper == GT_GT || tree->gtOper == GT_GE); } else { reverseOps = (tree->gtOper == GT_LT || tree->gtOper == GT_LE); } GenTreePtr otherOp; if (reverseOps) { otherOp = op1; } else { otherOp = op2; } assert(otherOp != nullptr); if (otherOp->IsCnsNonZeroFltOrDbl()) { MakeSrcContained(tree, otherOp); } else if (otherOp->isMemoryOp()) { if ((otherOp == op2) || IsSafeToContainMem(tree, otherOp)) { MakeSrcContained(tree, otherOp); } } else { // Mark otherOp as reg optional to indicate codgen can still generate // code even if otherOp is on stack. TryToSetRegOptional(otherOp); } return; } // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here // or in other backend. 
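    // Illustrative example (added): for a TYP_UBYTE value compared against 0x40, the immediate
    // falls in the [0..0x7f] range handled below, so the compare is marked GTF_RELOP_SMALL and
    // can be emitted as a byte-sized 'cmp' against a memory operand rather than widening the
    // value to TYP_INT first.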
bool hasShortCast = false; if (CheckImmedAndMakeContained(tree, op2)) { bool op1CanBeContained = (op1Type == op2Type); if (!op1CanBeContained) { if (genTypeSize(op1Type) == genTypeSize(op2Type)) { // The constant is of the correct size, but we don't have an exact type match // We can treat the isMemoryOp as "contained" op1CanBeContained = true; } } // Do we have a short compare against a constant in op2 // if (varTypeIsSmall(op1Type)) { GenTreeIntCon* con = op2->AsIntCon(); ssize_t ival = con->gtIconVal; bool isEqualityCompare = (tree->gtOper == GT_EQ || tree->gtOper == GT_NE); bool useTest = isEqualityCompare && (ival == 0); if (!useTest) { ssize_t lo = 0; // minimum imm value allowed for cmp reg,imm ssize_t hi = 0; // maximum imm value allowed for cmp reg,imm bool isUnsigned = false; switch (op1Type) { case TYP_BOOL: op1Type = TYP_UBYTE; __fallthrough; case TYP_UBYTE: lo = 0; hi = 0x7f; isUnsigned = true; break; case TYP_BYTE: lo = -0x80; hi = 0x7f; break; case TYP_CHAR: lo = 0; hi = 0x7fff; isUnsigned = true; break; case TYP_SHORT: lo = -0x8000; hi = 0x7fff; break; default: unreached(); } if ((ival >= lo) && (ival <= hi)) { // We can perform a small compare with the immediate 'ival' tree->gtFlags |= GTF_RELOP_SMALL; if (isUnsigned && !isEqualityCompare) { tree->gtFlags |= GTF_UNSIGNED; } // We can treat the isMemoryOp as "contained" op1CanBeContained = true; } } } if (op1CanBeContained) { if (op1->isMemoryOp()) { MakeSrcContained(tree, op1); } else { // When op1 is a GT_AND we can often generate a single "test" instruction // instead of two instructions (an "and" instruction followed by a "cmp"/"test") // // This instruction can only be used for equality or inequality comparions. // and we must have a compare against zero. // // If we have a postive test for a single bit we can reverse the condition and // make the compare be against zero // // Example: // GT_EQ GT_NE // / \ / \ // GT_AND GT_CNS (0x100) ==>> GT_AND GT_CNS (0) // / \ / \ // andOp1 GT_CNS (0x100) andOp1 GT_CNS (0x100) // // We will mark the GT_AND node as contained if the tree is a equality compare with zero // Additionally when we do this we also allow for a contained memory operand for "andOp1". // bool isEqualityCompare = (tree->gtOper == GT_EQ || tree->gtOper == GT_NE); if (isEqualityCompare && (op1->OperGet() == GT_AND)) { GenTreePtr andOp2 = op1->gtOp.gtOp2; if (IsContainableImmed(op1, andOp2)) { ssize_t andOp2CnsVal = andOp2->AsIntConCommon()->IconValue(); ssize_t relOp2CnsVal = op2->AsIntConCommon()->IconValue(); if ((relOp2CnsVal == andOp2CnsVal) && isPow2(andOp2CnsVal)) { // We have a single bit test, so now we can change the // tree into the alternative form, // so that we can generate a test instruction. // Reverse the equality comparison tree->gtOper = (tree->gtOper == GT_EQ) ? GT_NE : GT_EQ; // Change the relOp2CnsVal to zero relOp2CnsVal = 0; op2->AsIntConCommon()->SetIconValue(0); } // Now do we have a equality compare with zero? // if (relOp2CnsVal == 0) { // Note that child nodes must be made contained before parent nodes // Check for a memory operand for op1 with the test instruction // GenTreePtr andOp1 = op1->gtOp.gtOp1; if (andOp1->isMemoryOp()) { // If the type of value memoryOp (andOp1) is not the same as the type of constant (andOp2) // check to see whether it is safe to mark AndOp1 as contained. For e.g. in the following // case it is not safe to mark andOp1 as contained // AndOp1 = signed byte and andOp2 is an int constant of value 512. 
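                        // (Added clarification: a signed byte holding 0xFF sign-extends to 0xFFFFFFFF,
                        //  so the 4-byte semantics give 0xFFFFFFFF & 0x200 != 0, while a byte-sized
                        //  'test' would truncate the 0x200 mask to 0 and always report zero.)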
// // If it is safe, we update the type and value of andOp2 to match with andOp1. bool containable = (andOp1->TypeGet() == op1->TypeGet()); if (!containable) { ssize_t newIconVal = 0; switch (andOp1->TypeGet()) { default: break; case TYP_BYTE: newIconVal = (signed char)andOp2CnsVal; containable = FitsIn(andOp2CnsVal); break; case TYP_BOOL: case TYP_UBYTE: newIconVal = andOp2CnsVal & 0xFF; containable = true; break; case TYP_SHORT: newIconVal = (signed short)andOp2CnsVal; containable = FitsIn(andOp2CnsVal); break; case TYP_CHAR: newIconVal = andOp2CnsVal & 0xFFFF; containable = true; break; case TYP_INT: newIconVal = (INT32)andOp2CnsVal; containable = FitsIn(andOp2CnsVal); break; case TYP_UINT: newIconVal = andOp2CnsVal & 0xFFFFFFFF; containable = true; break; #ifdef _TARGET_64BIT_ case TYP_LONG: newIconVal = (INT64)andOp2CnsVal; containable = true; break; case TYP_ULONG: newIconVal = (UINT64)andOp2CnsVal; containable = true; break; #endif //_TARGET_64BIT_ } if (containable) { andOp2->gtType = andOp1->TypeGet(); andOp2->AsIntConCommon()->SetIconValue(newIconVal); } } // Mark the 'andOp1' memory operand as contained // Note that for equality comparisons we don't need // to deal with any signed or unsigned issues. if (containable) { MakeSrcContained(op1, andOp1); } } // Mark the 'op1' (the GT_AND) operand as contained MakeSrcContained(tree, op1); // During Codegen we will now generate "test andOp1, andOp2CnsVal" } } } else if (op1->OperGet() == GT_CAST) { //If the op1 is a cast operation, and cast type is one byte sized unsigned type, //we can directly use the number in register, instead of doing an extra cast step. var_types dstType = op1->CastToType(); bool isUnsignedDst = varTypeIsUnsigned(dstType); emitAttr castSize = EA_ATTR(genTypeSize(dstType)); GenTreePtr castOp1 = op1->gtOp.gtOp1; genTreeOps castOp1Oper = castOp1->OperGet(); bool safeOper = false; // It is not always safe to change the gtType of 'castOp1' to TYP_UBYTE // For example when 'castOp1Oper' is a GT_RSZ or GT_RSH then we are shifting // bits from the left into the lower bits. If we change the type to a TYP_UBYTE // we will instead generate a byte sized shift operation: shr al, 24 // For the following ALU operations is it safe to change the gtType to the // smaller type: // if ((castOp1Oper == GT_CNS_INT) || (castOp1Oper == GT_CALL) || // the return value from a Call (castOp1Oper == GT_LCL_VAR) || castOp1->OperIsLogical() || // GT_AND, GT_OR, GT_XOR castOp1->isMemoryOp() ) // isIndir() || isLclField(); { safeOper = true; } if ((castSize == EA_1BYTE) && isUnsignedDst && // Unsigned cast to TYP_UBYTE safeOper && // Must be a safe operation !op1->gtOverflow() ) // Must not be an overflow checking cast { // Currently all of the Oper accepted as 'safeOper' are // non-overflow checking operations. If we were to add // an overflow checking operation then this assert needs // to be moved above to guard entry to this block. // assert(!castOp1->gtOverflowEx()); // Must not be an overflow checking operation GenTreePtr removeTreeNode = op1; GenTreePtr removeTreeNodeChild = castOp1; tree->gtOp.gtOp1 = castOp1; castOp1->gtType = TYP_UBYTE; // trim down the value if castOp1 is an int constant since its type changed to UBYTE. 
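                        // (Added example: a constant value such as 0x1FF becomes 0xFF below, matching
                        //  the new TYP_UBYTE width of castOp1.)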
if (castOp1Oper == GT_CNS_INT) { castOp1->gtIntCon.gtIconVal = (UINT8)castOp1->gtIntCon.gtIconVal; } if (op2->isContainedIntOrIImmed()) { ssize_t val = (ssize_t)op2->AsIntConCommon()->IconValue(); if (val >= 0 && val <= 255) { op2->gtType = TYP_UBYTE; tree->gtFlags |= GTF_UNSIGNED; //right now the op1's type is the same as op2's type. //if op1 is MemoryOp, we should make the op1 as contained node. if (castOp1->isMemoryOp()) { MakeSrcContained(tree, op1); } } } comp->fgSnipNode(comp->compCurStmt->AsStmt(), removeTreeNode); #ifdef DEBUG if (comp->verbose) { printf("LowerCmp: Removing a GT_CAST to TYP_UBYTE and changing castOp1->gtType to TYP_UBYTE\n"); comp->gtDispTree(tree); } #endif } } } } } else if (op1Type == op2Type) { if (op2->isMemoryOp()) { MakeSrcContained(tree, op2); } else if (op1->isMemoryOp() && IsSafeToContainMem(tree, op1)) { MakeSrcContained(tree, op1); } else { // One of op1 or op2 could be marked as reg optional // to indicate that codgen can still generate code // if one of them is on stack. TryToSetRegOptional(op2); if (!op2->IsRegOptional()) { TryToSetRegOptional(op1); } } if (varTypeIsSmall(op1Type) && varTypeIsUnsigned(op1Type)) { // Mark the tree as doing unsigned comparison if // both the operands are small and unsigned types. // Otherwise we will end up performing a signed comparison // of two small unsigned values without zero extending them to // TYP_INT size and which is incorrect. tree->gtFlags |= GTF_UNSIGNED; } } } /* Lower GT_CAST(srcType, DstType) nodes. * * Casts from small int type to float/double are transformed as follows: * GT_CAST(byte, float/double) = GT_CAST(GT_CAST(byte, int32), float/double) * GT_CAST(sbyte, float/double) = GT_CAST(GT_CAST(sbyte, int32), float/double) * GT_CAST(int16, float/double) = GT_CAST(GT_CAST(int16, int32), float/double) * GT_CAST(uint16, float/double) = GT_CAST(GT_CAST(uint16, int32), float/double) * * SSE2 conversion instructions operate on signed integers. casts from Uint32/Uint64 * are morphed as follows by front-end and hence should not be seen here. * GT_CAST(uint32, float/double) = GT_CAST(GT_CAST(uint32, long), float/double) * GT_CAST(uint64, float) = GT_CAST(GT_CAST(uint64, double), float) * * * Similarly casts from float/double to a smaller int type are transformed as follows: * GT_CAST(float/double, byte) = GT_CAST(GT_CAST(float/double, int32), byte) * GT_CAST(float/double, sbyte) = GT_CAST(GT_CAST(float/double, int32), sbyte) * GT_CAST(float/double, int16) = GT_CAST(GT_CAST(double/double, int32), int16) * GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(double/double, int32), uint16) * * SSE2 has instructions to convert a float/double vlaue into a signed 32/64-bit * integer. The above transformations help us to leverage those instructions. * * Note that for the following conversions we still depend on helper calls and * don't expect to see them here. * i) GT_CAST(float/double, uint64) * ii) GT_CAST(float/double, int type with overflow detection) * * TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above. * There are hardly any occurrences of this conversion operation in platform * assemblies or in CQ perf benchmarks (1 occurrence in mscorlib, microsoft.jscript, * 1 occurence in Roslyn and no occurrences in system, system.core, system.numerics * system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that * doing this optimization is a win, should consider generating in-lined code. 
*/ void Lowering::LowerCast( GenTreePtr* ppTree) { GenTreePtr tree = *ppTree; assert(tree->OperGet() == GT_CAST); GenTreePtr op1 = tree->gtOp.gtOp1; var_types dstType = tree->CastToType(); var_types srcType = op1->TypeGet(); var_types tmpType = TYP_UNDEF; bool srcUns = false; // force the srcType to unsigned if GT_UNSIGNED flag is set if (tree->gtFlags & GTF_UNSIGNED) { srcType = genUnsignedType(srcType); } // We should never see the following casts as they are expected to be lowered // apropriately or converted into helper calls by front-end. // srcType = float/double dstType = * and overflow detecting cast // Reason: must be converted to a helper call // srcType = float/double, dstType = ulong // Reason: must be converted to a helper call // srcType = uint dstType = float/double // Reason: uint -> float/double = uint -> long -> float/double // srcType = ulong dstType = float // Reason: ulong -> float = ulong -> double -> float if (varTypeIsFloating(srcType)) { noway_assert(!tree->gtOverflow()); noway_assert(dstType != TYP_ULONG); } else if (srcType == TYP_UINT) { noway_assert(!varTypeIsFloating(dstType)); } else if (srcType == TYP_ULONG) { noway_assert(dstType != TYP_FLOAT); } // Case of src is a small type and dst is a floating point type. if (varTypeIsSmall(srcType) && varTypeIsFloating(dstType)) { // These conversions can never be overflow detecting ones. noway_assert(!tree->gtOverflow()); tmpType = TYP_INT; } // case of src is a floating point type and dst is a small type. else if (varTypeIsFloating(srcType) && varTypeIsSmall(dstType)) { tmpType = TYP_INT; } if (tmpType != TYP_UNDEF) { GenTreePtr tmp = comp->gtNewCastNode(tmpType, op1, tmpType); tmp->gtFlags |= (tree->gtFlags & (GTF_UNSIGNED|GTF_OVERFLOW|GTF_EXCEPT)); tree->gtFlags &= ~GTF_UNSIGNED; tree->gtOp.gtOp1 = tmp; op1->InsertAfterSelf(tmp); } } //---------------------------------------------------------------------------------------------- // Returns true if this tree is bin-op of a GT_STOREIND of the following form // storeInd(subTreeA, binOp(gtInd(subTreeA), subtreeB)) or // storeInd(subTreeA, binOp(subtreeB, gtInd(subTreeA)) in case of commutative bin-ops // // The above form for storeInd represents a read-modify-write memory binary operation. // // Parameters // tree - GentreePtr of binOp // // Return Value // True if 'tree' is part of a RMW memory operation pattern // bool Lowering::IsBinOpInRMWStoreInd(GenTreePtr tree) { // Must be a non floating-point type binary operator since SSE2 doesn't support RMW memory ops assert(!varTypeIsFloating(tree)); assert(GenTree::OperIsBinary(tree->OperGet())); // Cheap bail out check before more expensive checks are performed. // RMW memory op pattern requires that one of the operands of binOp to be GT_IND. if (tree->gtGetOp1()->OperGet() != GT_IND && tree->gtGetOp2()->OperGet() != GT_IND) { return false; } GenTreePtr parent = tree->gtGetParent(nullptr); if (parent == nullptr || parent->OperGet() != GT_STOREIND || parent->gtGetOp2() != tree) { return false; } // Since it is not relatively cheap to recognize RMW memory op pattern, we // cache the result in GT_STOREIND node so that while lowering GT_STOREIND // we can use the result. 
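    // (Illustrative example, added: for source like '*p += x' the IR is
    //  STOREIND(addr, ADD(IND(addr), x)); when the two indirections are equivalent, the whole
    //  tree can be emitted as a single 'add [addressing mode], reg' instruction.)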
    GenTreePtr indirCandidate = nullptr;
    GenTreePtr indirOpSource  = nullptr;
    return IsRMWMemOpRootedAtStoreInd(parent, &indirCandidate, &indirOpSource);
}

//----------------------------------------------------------------------------------------------
// This method recognizes the case where we have a treeNode with the following structure:
//         storeInd(IndirDst, binOp(gtInd(IndirDst), indirOpSource)) OR
//         storeInd(IndirDst, binOp(indirOpSource, gtInd(IndirDst))) in case of commutative operations OR
//         storeInd(IndirDst, unaryOp(gtInd(IndirDst))) in case of unary operations
//
// Terminology:
//         indirDst = memory write of an addr mode (i.e. storeind destination)
//         indirSrc = value being written to memory (i.e. storeind source which could either be a binary or unary op)
//         indirCandidate = memory read i.e. a gtInd of an addr mode
//         indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
//
// In x86/x64 this storeInd pattern can be effectively encoded in a single instruction of the
// following form in case of integer operations:
//         binOp [addressing mode], RegIndirOpSource
//         binOp [addressing mode], immediateVal
// where RegIndirOpSource is the register where indirOpSource was computed.
//
// Right now, we recognize a few cases:
//     a) The gtInd child is a lea/lclVar/lclVarAddr/clsVarAddr/constant
//     b) BinOp is either add, sub, xor, or, and, shl, rsh, rsz.
//     c) unaryOp is either not/neg
//
// Implementation Note: The following routines need to be in sync for RMW memory op optimization
// to be correct and functional.
//     IndirsAreEquivalent()
//     NodesAreEquivalentLeaves()
//     Codegen of GT_STOREIND and genCodeForShiftRMW()
//     emitInsRMW()
//
// TODO-CQ: Enable support for more complex indirections (if needed) or use the value numbering
// package to perform more complex tree recognition.
//
// TODO-XArch-CQ: Add support for RMW of lcl fields (e.g. lclfield binop= source)
//
// Parameters:
//     tree               -  GT_STOREIND node
//     outIndirCandidate  -  out param set to indirCandidate as described above
//     outIndirOpSource   -  out param set to indirOpSource as described above
//
// Return value
//     True if there is a RMW memory operation rooted at a GT_STOREIND tree
//     and out params indirCandidate and indirOpSource are set to non-null values.
//     Otherwise, returns false with indirCandidate and indirOpSource set to null.
//     Also updates flags of GT_STOREIND tree with its RMW status.
// bool Lowering::IsRMWMemOpRootedAtStoreInd(GenTreePtr tree, GenTreePtr *outIndirCandidate, GenTreePtr *outIndirOpSource) { assert(!varTypeIsFloating(tree)); assert(outIndirCandidate != nullptr); assert(outIndirOpSource != nullptr); *outIndirCandidate = nullptr; *outIndirOpSource = nullptr; // Early out if storeInd is already known to be a non-RMW memory op GenTreeStoreInd* storeInd = tree->AsStoreInd(); if (storeInd->IsNonRMWMemoryOp()) { return false; } GenTreePtr indirDst = storeInd->gtGetOp1(); GenTreePtr indirSrc = storeInd->gtGetOp2(); genTreeOps oper = indirSrc->OperGet(); // Early out if it is already known to be a RMW memory op if (storeInd->IsRMWMemoryOp()) { if (GenTree::OperIsBinary(oper)) { if (storeInd->IsRMWDstOp1()) { *outIndirCandidate = indirSrc->gtGetOp1(); *outIndirOpSource = indirSrc->gtGetOp2(); } else { assert(storeInd->IsRMWDstOp2()); *outIndirCandidate = indirSrc->gtGetOp2(); *outIndirOpSource = indirSrc->gtGetOp1(); } assert(IndirsAreEquivalent(*outIndirCandidate, storeInd)); } else { assert(GenTree::OperIsUnary(oper)); assert(IndirsAreEquivalent(indirSrc->gtGetOp1(), storeInd)); *outIndirCandidate = indirSrc->gtGetOp1(); *outIndirOpSource = indirSrc->gtGetOp1(); } return true; } // If reached here means that we do not know RMW status of tree rooted at storeInd assert(storeInd->IsRMWStatusUnknown()); // Early out if indirDst is not one of the supported memory operands. if (indirDst->OperGet() != GT_LEA && indirDst->OperGet() != GT_LCL_VAR && indirDst->OperGet() != GT_LCL_VAR_ADDR && indirDst->OperGet() != GT_CLS_VAR_ADDR && indirDst->OperGet() != GT_CNS_INT) { storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR); return false; } // We can not use Read-Modify-Write instruction forms with overflow checking instructions // because we are not allowed to modify the target until after the overflow check. if (indirSrc->gtOverflowEx()) { storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER); return false; } if (GenTree::OperIsBinary(oper)) { // Return if binary op is not one of the supported operations for RMW of memory. if (oper != GT_ADD && oper != GT_SUB && oper != GT_AND && oper != GT_OR && oper != GT_XOR && !GenTree::OperIsShiftOrRotate(oper)) { storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER); return false; } if (GenTree::OperIsShiftOrRotate(oper) && varTypeIsSmall(storeInd)) { // In ldind, Integer values smaller than 4 bytes, a boolean, or a character converted to 4 bytes // by sign or zero-extension as appropriate. If we directly shift the short type data using sar, we // will lose the sign or zero-extension bits. 
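        // (Added example: a 2-byte unsigned location holding 0x8000 is widened to 0x00008000, so
        //  an arithmetic shift right by 1 should yield 0x4000, whereas a direct
        //  'sar word ptr [mem], 1' would treat the value as signed and produce 0xC000.)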
storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_TYPE); return false; } GenTreePtr rhsLeft = indirSrc->gtGetOp1(); GenTreePtr rhsRight = indirSrc->gtGetOp2(); // The most common case is rhsRight is GT_IND if (GenTree::OperIsCommutative(oper) && rhsRight->OperGet() == GT_IND && rhsRight->gtGetOp1()->OperGet() == indirDst->OperGet() && IndirsAreEquivalent(rhsRight, storeInd)) { *outIndirCandidate = rhsRight; *outIndirOpSource = rhsLeft; storeInd->SetRMWStatus(STOREIND_RMW_DST_IS_OP2); return true; } else if (rhsLeft->OperGet() == GT_IND && rhsLeft->gtGetOp1()->OperGet() == indirDst->OperGet() && IsSafeToContainMem(indirSrc, rhsLeft) && IndirsAreEquivalent(rhsLeft, storeInd)) { *outIndirCandidate = rhsLeft; *outIndirOpSource = rhsRight; storeInd->SetRMWStatus(STOREIND_RMW_DST_IS_OP1); return true; } storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR); return false; } else if (GenTree::OperIsUnary(oper)) { // Nodes other than GT_NOT and GT_NEG are not yet supported. if (oper != GT_NOT && oper != GT_NEG) { storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER); return false; } if (indirSrc->gtGetOp1()->OperGet() != GT_IND) { storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_ADDR); return false; } GenTreePtr indirCandidate = indirSrc->gtGetOp1(); if (indirCandidate->gtGetOp1()->OperGet() == indirDst->OperGet() && IndirsAreEquivalent(indirCandidate, storeInd)) { // src and dest are the same in case of unary ops *outIndirCandidate = indirCandidate; *outIndirOpSource = indirCandidate; storeInd->SetRMWStatus(STOREIND_RMW_DST_IS_OP1); return true; } } assert(*outIndirCandidate == nullptr); assert(*outIndirOpSource == nullptr); storeInd->SetRMWStatus(STOREIND_RMW_UNSUPPORTED_OPER); return false; } //-------------------------------------------------------------------------------------------- // SetStoreIndOpCountsIfRMWMemOp checks to see if there is a RMW memory operation rooted at // GT_STOREIND node and if so will mark register requirements for nodes under storeInd so // that CodeGen will generate a single instruction of the form: // // binOp [addressing mode], reg // // Parameters // storeInd - GT_STOREIND node // // Return value // True, if RMW memory op tree pattern is recognized and op counts are set. // False otherwise. // bool Lowering::SetStoreIndOpCountsIfRMWMemOp(GenTreePtr storeInd) { assert(storeInd->OperGet() == GT_STOREIND); // SSE2 doesn't support RMW on float values assert(!varTypeIsFloating(storeInd)); // Terminology: // indirDst = memory write of an addr mode (i.e. storeind destination) // indirSrc = value being written to memory (i.e. storeind source which could a binary/unary op) // indirCandidate = memory read i.e. a gtInd of an addr mode // indirOpSource = source operand used in binary/unary op (i.e. 
source operand of indirSrc node) GenTreePtr indirCandidate = nullptr; GenTreePtr indirOpSource = nullptr; if (!IsRMWMemOpRootedAtStoreInd(storeInd, &indirCandidate, &indirOpSource)) { JITDUMP("Lower of StoreInd didn't mark the node as self contained for reason: %d\n", storeInd->AsStoreInd()->GetRMWStatus()); DISPTREE(storeInd); return false; } GenTreePtr indirDst = storeInd->gtGetOp1(); GenTreePtr indirSrc = storeInd->gtGetOp2(); genTreeOps oper = indirSrc->OperGet(); // At this point we have successfully detected a RMW memory op of one of the following forms // storeInd(indirDst, indirSrc(indirCandidate, indirOpSource)) OR // storeInd(indirDst, indirSrc(indirOpSource, indirCandidate) in case of commutative operations OR // storeInd(indirDst, indirSrc(indirCandidate) in case of unary operations // // Here indirSrc = one of the supported binary or unary operation for RMW of memory // indirCandidate = a GT_IND node // indirCandidateChild = operand of GT_IND indirCandidate // // The logic below essentially does the following // set storeInd src count to that of the dst count of indirOpSource // clear operand counts on indirSrc (i.e. marked as contained and storeInd will generate code for it) // clear operand counts on indirCandidate // clear operand counts on indirDst except when it is a GT_LCL_VAR or GT_CNS_INT that doesn't fit within addr base // Increment src count of storeInd to account for the registers required to form indirDst addr mode // clear operand counts on indirCandidateChild TreeNodeInfo* info = &(storeInd->gtLsraInfo); info->dstCount = 0; if (GenTree::OperIsBinary(oper)) { // On Xarch RMW operations require that the source memory-op be in a register. assert(!indirOpSource->isMemoryOp() || indirOpSource->gtLsraInfo.dstCount == 1); JITDUMP("Lower succesfully detected an assignment of the form: *addrMode BinOp= source\n"); info->srcCount = indirOpSource->gtLsraInfo.dstCount; } else { assert(GenTree::OperIsUnary(oper)); JITDUMP("Lower succesfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n"); info->srcCount = 0; } DISPTREE(storeInd); m_lsra->clearOperandCounts(indirSrc); m_lsra->clearOperandCounts(indirCandidate); GenTreePtr indirCandidateChild = indirCandidate->gtGetOp1(); if (indirCandidateChild->OperGet() == GT_LEA) { GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode(); if (addrMode->HasBase()) { assert(addrMode->Base()->OperIsLeaf()); m_lsra->clearOperandCounts(addrMode->Base()); info->srcCount++; } if (addrMode->HasIndex()) { assert(addrMode->Index()->OperIsLeaf()); m_lsra->clearOperandCounts(addrMode->Index()); info->srcCount++; } m_lsra->clearOperandCounts(indirDst); } else { assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR || indirCandidateChild->OperGet() == GT_CNS_INT); // If it is a GT_LCL_VAR, it still needs the reg to hold the address. // We would still need a reg for GT_CNS_INT if it doesn't fit within addressing mode base. // For GT_CLS_VAR_ADDR, we don't need a reg to hold the address, because field address value is known at jit time. // Also, we don't need a reg for GT_CLS_VAR_ADDR. 
if (indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR) { m_lsra->clearOperandCounts(indirDst); } else if (indirCandidateChild->IsCnsIntOrI() && indirCandidateChild->AsIntConCommon()->FitsInAddrBase(comp)) { m_lsra->clearOperandCounts(indirDst); } else { // Need a reg and hence increment src count of storeind info->srcCount += indirCandidateChild->gtLsraInfo.dstCount; } } m_lsra->clearOperandCounts(indirCandidateChild); return true; } /** * Takes care of annotating the src and dst register * requirements for a GT_MUL treenode. */ void Lowering::SetMulOpCounts(GenTreePtr tree) { assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI); TreeNodeInfo* info = &(tree->gtLsraInfo); info->srcCount = 2; info->dstCount = 1; GenTreePtr op1 = tree->gtOp.gtOp1; GenTreePtr op2 = tree->gtOp.gtOp2; // Case of float/double mul. if (varTypeIsFloating(tree->TypeGet())) { assert(tree->OperGet() == GT_MUL); if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl()) { MakeSrcContained(tree, op2); } else if (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))) { // Since GT_MUL is commutative, we will try to re-order operands if it is safe to // generate more efficient code sequence for the case of GT_MUL(op1=memOp, op2=non-memOp) MakeSrcContained(tree, op1); } else { // If there are no containable operands, we can make an operand reg optional. SetRegOptionalForBinOp(tree); } return; } bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0); bool requiresOverflowCheck = tree->gtOverflowEx(); bool useLeaEncoding = false; GenTreePtr memOp = nullptr; // There are three forms of x86 multiply: // one-op form: RDX:RAX = RAX * r/m // two-op form: reg *= r/m // three-op form: reg = r/m * imm // This special widening 32x32->64 MUL is not used on x64 assert((tree->gtFlags & GTF_MUL_64RSLT) == 0); // Multiply should never be using small types assert(!varTypeIsSmall(tree->TypeGet())); // We do use the widening multiply to implement // the overflow checking for unsigned multiply // if (isUnsignedMultiply && requiresOverflowCheck) { // The only encoding provided is RDX:RAX = RAX * rm // // Here we set RAX as the only destination candidate // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX // info->setDstCandidates(m_lsra,RBM_RAX); } else if (tree->gtOper == GT_MULHI) { // have to use the encoding:RDX:RAX = RAX * rm info->setDstCandidates(m_lsra, RBM_RAX); } else if (IsContainableImmed(tree, op2) || IsContainableImmed(tree, op1)) { GenTreeIntConCommon* imm; GenTreePtr other; if (IsContainableImmed(tree, op2)) { imm = op2->AsIntConCommon(); other = op1; } else { imm = op1->AsIntConCommon(); other = op2; } // CQ: We want to rewrite this into a LEA ssize_t immVal = imm->AsIntConCommon()->IconValue(); if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9)) { useLeaEncoding = true; } MakeSrcContained(tree, imm); // The imm is always contained if (other->isIndir()) { memOp = other; // memOp may be contained below } } // We allow one operand to be a contained memory operand. // The memory op type must match with the 'tree' type. // This is because during codegen we use 'tree' type to derive EmitTypeSize. // E.g op1 type = byte, op2 type = byte but GT_MUL tree type is int. 
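    // (Added note, assumption: when the memory operand's type matches the node, the two-operand
    //  form 'imul reg, dword ptr [mem]' can be used, so that operand never needs its own register.)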
// if (memOp == nullptr && op2->isMemoryOp()) { memOp = op2; } // To generate an LEA we need to force memOp into a register // so don't allow memOp to be 'contained' // if (!useLeaEncoding) { if (memOp != nullptr) { if ((memOp->TypeGet() == tree->TypeGet()) && IsSafeToContainMem(tree, memOp)) { MakeSrcContained(tree, memOp); } } else { // If there are no containable operands, we can make an operand reg optional. SetRegOptionalForBinOp(tree); } } } //------------------------------------------------------------------------------ // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format // // Arguments: // tree - a binary tree node // // Return Value: // Returns true if we can use the read-modify-write instruction form // // Notes: // This is used to determine whether to preference the source to the destination register. // bool Lowering::isRMWRegOper(GenTreePtr tree) { // TODO-XArch-CQ: Make this more accurate. // For now, We assume that most binary operators are of the RMW form. assert(tree->OperIsBinary()); if (tree->OperIsCompare()) { return false; } // These Opers either support a three op form (i.e. GT_LEA), or do not read/write their first operand if ((tree->OperGet() == GT_LEA) || (tree->OperGet() == GT_STOREIND) || (tree->OperGet() == GT_ARR_INDEX)) return false; // x86/x64 does support a three op multiply when op2|op1 is a contained immediate if ((tree->OperGet() == GT_MUL) && (Lowering::IsContainableImmed(tree, tree->gtOp.gtOp2) || Lowering::IsContainableImmed(tree, tree->gtOp.gtOp1))) { return false; } // otherwise we return true. return true; } // anything is in range for AMD64 bool Lowering::IsCallTargetInRange(void* addr) { return true; } // return true if the immediate can be folded into an instruction, for example small enough and non-relocatable bool Lowering:: IsContainableImmed(GenTree* parentNode, GenTree* childNode) { if (!childNode->IsIntCnsFitsInI32()) { return false; } // At this point we know that it is an int const fits within 4-bytes and hence can safely cast to IntConCommon. // Icons that need relocation should never be marked as contained immed if (childNode->AsIntConCommon()->ImmedValNeedsReloc(comp)) { return false; } return true; } //---------------------------------------------------------------------- // TryToSetRegOptional - sets a bit to indicate to LSRA that register // for a given tree node is optional for codegen purpose. If no // register is allocated to such a tree node, its parent node treats // it as a contained memory operand during codegen. // // Arguments: // tree - GenTree node // // Returns // None // // Note: Right now a tree node is marked as reg optional only // if is it a GT_LCL_VAR. This routine needs to be modified if // in future if lower/codegen needs to support other tree node // types. void Lowering::TryToSetRegOptional(GenTree* tree) { if (tree->OperGet() == GT_LCL_VAR) { tree->gtLsraInfo.regOptional = true; } } // ------------------------------------------------------------------ // SetRegOptionalBinOp - Indicates which of the operands of a bin-op // register requirement is optional. Xarch instruction set allows // either of op1 or op2 of binary operation (e.g. add, mul etc) to be // a memory operand. This routine provides info to register allocator // which of its operands optionally require a register. Lsra might not // allocate a register to RefTypeUse positions of such operands if it // is beneficial. In such a case codegen will treat them as memory // operands. 
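// (Added example: if op2 of an integer GT_ADD is reg optional and LSRA leaves it on the stack,
//  codegen can emit 'add reg, dword ptr [stack slot]' directly rather than reloading op2.)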
//
// Arguments:
//     tree  -  GenTree node of a binary operation.
//
// Returns
//     None.
//
// Note: On xarch at most one of the operands will be marked as
// reg optional, even when both operands could be considered register
// optional.
void Lowering::SetRegOptionalForBinOp(GenTree* tree)
{
    assert(GenTree::OperIsBinary(tree->OperGet()));

    GenTree* op1 = tree->gtGetOp1();
    GenTree* op2 = tree->gtGetOp2();

    if (tree->TypeGet() == op2->TypeGet())
    {
        TryToSetRegOptional(op2);
    }

    if (!op2->IsRegOptional() && tree->OperIsCommutative() && tree->TypeGet() == op1->TypeGet())
    {
        TryToSetRegOptional(op1);
    }
}

#endif // _TARGET_XARCH_
#endif // !LEGACY_BACKEND