//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
//

/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XX                                                                           XX
XX                          Lowering for AMD64                               XX
XX                                                                           XX
XX  This encapsulates all the logic for lowering trees for the AMD64         XX
XX  architecture.  For a more detailed view of what lowering is, please      XX
XX  take a look at Lower.cpp                                                 XX
XX                                                                           XX
XX                                                                           XX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
*/

#include "jitpch.h"
#ifdef _MSC_VER
#pragma hdrstop
#endif

#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator

#ifdef _TARGET_XARCH_

#include "jit.h"
#include "lower.h"

// There is not much lowering to do when storing a local, but we do some handling of
// contained immediates and widening operations of unsigned values.
void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
{
    TreeNodeInfo* info = &(storeLoc->gtLsraInfo);

#ifdef FEATURE_SIMD
    if (storeLoc->TypeGet() == TYP_SIMD12)
    {
        // Need an additional register to extract upper 4 bytes of Vector3.
        info->internalFloatCount = 1;
        info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());

        // In this case don't mark the operand as contained as we want it to
        // be evaluated into an xmm register.
        return;
    }
#endif // FEATURE_SIMD

    // If the source is a containable immediate, make it contained, unless it is
    // an int-size or larger store of zero to memory, because we can generate smaller code
    // by zeroing a register and then storing it.
    GenTree* op1 = storeLoc->gtOp1;
    if (IsContainableImmed(storeLoc, op1) && (!op1->IsZero() || varTypeIsSmall(storeLoc)))
    {
        MakeSrcContained(storeLoc, op1);
    }

    // Try to widen the ops if they are going into a local var.
    if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (storeLoc->gtOp1->gtOper == GT_CNS_INT))
    {
        GenTreeIntCon* con    = storeLoc->gtOp1->AsIntCon();
        ssize_t        ival   = con->gtIconVal;
        unsigned       varNum = storeLoc->gtLclNum;
        LclVarDsc*     varDsc = comp->lvaTable + varNum;

        // If we are storing a constant into a local variable
        // we extend the size of the store here.
        if (genTypeSize(storeLoc) < 4)
        {
            if (!varTypeIsUnsigned(varDsc))
            {
                if (genTypeSize(storeLoc) == 1)
                {
                    if ((ival & 0x7f) != ival)
                    {
                        ival = ival | 0xffffff00;
                    }
                }
                else
                {
                    assert(genTypeSize(storeLoc) == 2);
                    if ((ival & 0x7fff) != ival)
                    {
                        ival = ival | 0xffff0000;
                    }
                }
            }

            // A local stack slot is at least 4 bytes in size, regardless of
            // what the local var is typed as, so auto-promote it here
            // unless it is a field of a promoted struct.
            // TODO-XArch-CQ: if the field is promoted shouldn't we also be able to do this?
            if (!varDsc->lvIsStructField)
            {
                storeLoc->gtType = TYP_INT;
                con->SetIconValue(ival);
            }
        }
    }
}

// TreeNodeInfoInitSimple:
//    Sets the srcCount and dstCount for all the trees without special handling based on the tree node type.
//
// args:
//    tree: The tree on which TreeNodeInfo's srcCount and dstCount are set.
//    info: The TreeNodeInfo on which to set the srcCount and dstCount.
//          This is the TreeNodeInfo corresponding to the tree parameter.
//    kind: The kind flags of the tree node.
//
void Lowering::TreeNodeInfoInitSimple(GenTree* tree, TreeNodeInfo* info, unsigned kind)
{
    info->dstCount = (tree->TypeGet() == TYP_VOID) ?
0 : 1; if (kind & (GTK_CONST | GTK_LEAF)) { info->srcCount = 0; } else if (kind & (GTK_SMPOP)) { if (tree->gtGetOp2() != nullptr) { info->srcCount = 2; } else { info->srcCount = 1; } } else { unreached(); } } /** * Takes care of annotating the register requirements * for every TreeNodeInfo struct that maps to each tree node. * Preconditions: * LSRA Has been initialized and there is a TreeNodeInfo node * already allocated and initialized for every tree in the IR. * Postconditions: * Every TreeNodeInfo instance has the right annotations on register * requirements needed by LSRA to build the Interval Table (source, * destination and internal [temp] register counts). * This code is refactored originally from LSRA. */ void Lowering::TreeNodeInfoInit(GenTree* stmt) { LinearScan* l = m_lsra; Compiler* compiler = comp; assert(stmt->gtStmt.gtStmtIsTopLevel()); GenTree* tree = stmt->gtStmt.gtStmtList; while (tree) { unsigned kind = tree->OperKind(); TreeNodeInfo* info = &(tree->gtLsraInfo); RegisterType registerType = TypeGet(tree); GenTree* next = tree->gtNext; switch (tree->OperGet()) { GenTree* op1; GenTree* op2; default: TreeNodeInfoInitSimple(tree, info, kind); break; case GT_LCL_FLD: info->srcCount = 0; info->dstCount = 1; #ifdef FEATURE_SIMD // Need an additional register to read upper 4 bytes of Vector3. if (tree->TypeGet() == TYP_SIMD12) { // We need an internal register different from targetReg in which 'tree' produces its result // because both targetReg and internal reg will be in use at the same time. This is achieved // by asking for two internal registers. info->internalFloatCount = 2; info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs()); } #endif break; case GT_STORE_LCL_FLD: info->srcCount = 1; info->dstCount = 0; LowerStoreLoc(tree->AsLclVarCommon()); break; case GT_STORE_LCL_VAR: info->srcCount = 1; info->dstCount = 0; LowerStoreLoc(tree->AsLclVarCommon()); break; case GT_BOX: noway_assert(!"box should not exist here"); // The result of 'op1' is also the final result info->srcCount = 0; info->dstCount = 0; break; case GT_PHYSREGDST: info->srcCount = 1; info->dstCount = 0; break; case GT_COMMA: { GenTreePtr firstOperand; GenTreePtr secondOperand; if (tree->gtFlags & GTF_REVERSE_OPS) { firstOperand = tree->gtOp.gtOp2; secondOperand = tree->gtOp.gtOp1; } else { firstOperand = tree->gtOp.gtOp1; secondOperand = tree->gtOp.gtOp2; } if (firstOperand->TypeGet() != TYP_VOID) { firstOperand->gtLsraInfo.isLocalDefUse = true; firstOperand->gtLsraInfo.dstCount = 0; } if (tree->TypeGet() == TYP_VOID && secondOperand->TypeGet() != TYP_VOID) { secondOperand->gtLsraInfo.isLocalDefUse = true; secondOperand->gtLsraInfo.dstCount = 0; } } __fallthrough; case GT_LIST: case GT_ARGPLACE: case GT_NO_OP: case GT_START_NONGC: case GT_PROF_HOOK: info->srcCount = 0; info->dstCount = 0; break; case GT_CNS_DBL: info->srcCount = 0; info->dstCount = 1; break; #if !defined(_TARGET_64BIT_) case GT_LONG: // Passthrough info->srcCount = 0; info->dstCount = 0; break; #endif // !defined(_TARGET_64BIT_) case GT_QMARK: case GT_COLON: info->srcCount = 0; info->dstCount = 0; unreached(); break; case GT_RETURN: #if !defined(_TARGET_64BIT_) if (tree->TypeGet() == TYP_LONG) { GenTree* op1 = tree->gtGetOp1(); noway_assert(op1->OperGet() == GT_LONG); GenTree* loVal = op1->gtGetOp1(); GenTree* hiVal = op1->gtGetOp2(); info->srcCount = 2; loVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_LO); hiVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_HI); info->dstCount = 0; } else #endif // !defined(_TARGET_64BIT_) { #ifdef 
FEATURE_UNIX_AMD64_STRUCT_PASSING if (tree->TypeGet() == TYP_STRUCT && tree->gtOp.gtOp1->OperGet() == GT_LCL_VAR) { #ifdef DEBUG GenTreeLclVarCommon* lclVarPtr = tree->gtOp.gtOp1->AsLclVarCommon(); LclVarDsc* varDsc = &(compiler->lvaTable[lclVarPtr->gtLclNum]); assert(varDsc->lvDontPromote); #endif // DEBUG // If this is a two eightbyte return, make the var // contained by the return expression. The code gen will put // the values in the right registers for return. info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1; info->dstCount = 0; MakeSrcContained(tree, tree->gtOp.gtOp1); break; } #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1; info->dstCount = 0; regMaskTP useCandidates; switch (tree->TypeGet()) { case TYP_VOID: useCandidates = RBM_NONE; break; case TYP_FLOAT: useCandidates = RBM_FLOATRET; break; case TYP_DOUBLE: useCandidates = RBM_DOUBLERET; break; #if defined(_TARGET_64BIT_) case TYP_LONG: useCandidates = RBM_LNGRET; break; #endif // defined(_TARGET_64BIT_) default: useCandidates = RBM_INTRET; break; } if (useCandidates != RBM_NONE) { tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, useCandidates); } } break; case GT_RETFILT: if (tree->TypeGet() == TYP_VOID) { info->srcCount = 0; info->dstCount = 0; } else { assert(tree->TypeGet() == TYP_INT); info->srcCount = 1; info->dstCount = 1; info->setSrcCandidates(l, RBM_INTRET); tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET); } break; // A GT_NOP is either a passthrough (if it is void, or if it has // a child), but must be considered to produce a dummy value if it // has a type but no child case GT_NOP: info->srcCount = 0; if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr) { info->dstCount = 1; } else { info->dstCount = 0; } break; case GT_JTRUE: info->srcCount = 0; info->dstCount = 0; l->clearDstCount(tree->gtOp.gtOp1); break; case GT_JMP: info->srcCount = 0; info->dstCount = 0; break; case GT_SWITCH: // This should never occur since switch nodes must not be visible at this // point in the JIT. info->srcCount = 0; info->dstCount = 0; // To avoid getting uninit errors. noway_assert(!"Switch must be lowered at this point"); break; case GT_JMPTABLE: info->srcCount = 0; info->dstCount = 1; break; case GT_SWITCH_TABLE: info->srcCount = 2; info->internalIntCount = 1; info->dstCount = 0; break; case GT_ASG: case GT_ASG_ADD: case GT_ASG_SUB: noway_assert(!"We should never hit any assignment operator in lowering"); info->srcCount = 0; info->dstCount = 0; break; case GT_ADD: case GT_SUB: // SSE2 arithmetic instructions doesn't support the form "op mem, xmm". // Rather they only support "op xmm, mem/xmm" form. if (varTypeIsFloating(tree->TypeGet())) { // overflow operations aren't supported on float/double types. assert(!tree->gtOverflow()); // No implicit conversions at this stage as the expectation is that // everything is made explicit by adding casts. assert(tree->gtOp.gtOp1->TypeGet() == tree->gtOp.gtOp2->TypeGet()); info->srcCount = 2; info->dstCount = 1; op2 = tree->gtOp.gtOp2; if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl()) { MakeSrcContained(tree, op2); } break; } __fallthrough; case GT_AND: case GT_OR: case GT_XOR: { // We're not marking a constant hanging on the left of the add // as containable so we assign it to a register having CQ impact. 
// TODO-XArch-CQ: Detect this case and support both generating a single instruction // for GT_ADD(Constant, SomeTree) and GT_ADD(SomeTree, Constant) info->srcCount = 2; info->dstCount = 1; op2 = tree->gtOp.gtOp2; // We can directly encode the second operand if it is either a containable constant or a local field. // In case of local field, we can encode it directly provided its type matches with 'tree' type. // This is because during codegen, type of 'tree' is used to determine emit Type size. If the types // do not match, they get normalized (i.e. sign/zero extended) on load into a register. bool directlyEncodable = false; if (IsContainableImmed(tree, op2)) { directlyEncodable = true; } else if ((tree->gtOp.gtOp1->gtOper != GT_IND) && op2->isLclField() && tree->TypeGet() == op2->TypeGet()) { directlyEncodable = true; } if (directlyEncodable) { l->clearDstCount(op2); info->srcCount = 1; } } break; case GT_RETURNTRAP: // this just turns into a compare of its child with an int // + a conditional call info->srcCount = 1; info->dstCount = 0; if (tree->gtOp.gtOp1->isIndir()) { MakeSrcContained(tree, tree->gtOp.gtOp1); } info->internalIntCount = 1; info->setInternalCandidates(l, l->allRegs(TYP_INT)); break; case GT_MOD: case GT_DIV: if (varTypeIsFloating(tree->TypeGet())) { // No implicit conversions at this stage as the expectation is that // everything is made explicit by adding casts. assert(tree->gtOp.gtOp1->TypeGet() == tree->gtOp.gtOp2->TypeGet()); info->srcCount = 2; info->dstCount = 1; op2 = tree->gtOp.gtOp2; if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl()) { MakeSrcContained(tree, op2); } break; } __fallthrough; case GT_UMOD: case GT_UDIV: { info->srcCount = 2; info->dstCount = 1; op1 = tree->gtOp.gtOp1; op2 = tree->gtOp.gtOp2; // See if we have an optimizable power of 2 which will be expanded // using instructions other than division. // (fgMorph has already done magic number transforms) if (op2->IsIntCnsFitsInI32()) { bool isSigned = tree->OperGet() == GT_MOD || tree->OperGet() == GT_DIV; ssize_t amount = op2->gtIntConCommon.IconValue(); if (isPow2(abs(amount)) && (isSigned || amount > 0) && amount != -1) { MakeSrcContained(tree, op2); if (isSigned) { // we are going to use CDQ instruction so want these RDX:RAX info->setDstCandidates(l, RBM_RAX); // If possible would like to have op1 in RAX to avoid a register move op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX); } break; } } // Amd64 Div/Idiv instruction: // Dividend in RAX:RDX and computes // Quotient in RAX, Remainder in RDX if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD) { // We are interested in just the remainder. // RAX is used as a trashable register during computation of remainder. info->setDstCandidates(l, RBM_RDX); } else { // We are interested in just the quotient. // RDX gets used as trashable register during computation of quotient info->setDstCandidates(l, RBM_RAX); } // If possible would like to have op1 in RAX to avoid a register move op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX); // divisor can be an r/m, but the memory indirection must be of the same size as the divide if (op2->isMemoryOp() && (op2->TypeGet() == tree->TypeGet())) { MakeSrcContained(tree, op2); } else { op2->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX)); } } break; case GT_MUL: case GT_MULHI: SetMulOpCounts(tree); break; case GT_MATH: { // Both operand and its result must be of floating point type. 
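        // Illustrative note (added commentary, not tied to a specific caller): a call such as
        // Math.Sqrt(d) on a double arrives here as GT_MATH(CORINFO_INTRINSIC_Sqrt) with a TYP_DOUBLE
        // operand, and sqrtsd can read that operand straight from memory; that is why the operand is
        // made containable below when it is a memory op or a non-zero floating point constant.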
op1 = tree->gtOp.gtOp1; assert(varTypeIsFloating(op1)); assert(op1->TypeGet() == tree->TypeGet()); info->srcCount = 1; info->dstCount = 1; switch(tree->gtMath.gtMathFN) { case CORINFO_INTRINSIC_Sqrt: if (op1->isMemoryOp() || op1->IsCnsNonZeroFltOrDbl()) { MakeSrcContained(tree, op1); } break; case CORINFO_INTRINSIC_Abs: // Abs(float x) = x & 0x7fffffff // Abs(double x) = x & 0x7ffffff ffffffff // In case of Abs we need an internal register to hold mask. // TODO-XArch-CQ: avoid using an internal register for the mask. // Andps or andpd both will operate on 128-bit operands. // The data section constant to hold the mask is a 64-bit size. // Therefore, we need both the operand and mask to be in // xmm register. When we add support in emitter to emit 128-bit // data constants and instructions that operate on 128-bit // memory operands we can avoid the need for an internal register. if (tree->gtMath.gtMathFN == CORINFO_INTRINSIC_Abs) { info->internalFloatCount = 1; info->setInternalCandidates(l, l->internalFloatRegCandidates()); } break; #ifdef _TARGET_X86_ case CORINFO_INTRINSIC_Cos: case CORINFO_INTRINSIC_Sin: case CORINFO_INTRINSIC_Round: NYI_X86("Math intrinsics Cos, Sin and Round"); break; #endif // _TARGET_X86_ default: // Right now only Sqrt/Abs are treated as math intrinsics noway_assert(!"Unsupported math intrinsic"); unreached(); break; } } break; #ifdef FEATURE_SIMD case GT_SIMD: TreeNodeInfoInitSIMD(tree, l); break; #endif // FEATURE_SIMD case GT_CAST: { // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register. // see CodeGen::genIntToIntCast() info->srcCount = 1; info->dstCount = 1; // Non-overflow casts to/from float/double are done using SSE2 instructions // and that allow the source operand to be either a reg or memop. Given the // fact that casts from small int to float/double are done as two-level casts, // the source operand is always guaranteed to be of size 4 or 8 bytes. var_types castToType = tree->CastToType(); GenTreePtr castOp = tree->gtCast.CastOp(); var_types castOpType = castOp->TypeGet(); if (tree->gtFlags & GTF_UNSIGNED) { castOpType = genUnsignedType(castOpType); } if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType))) { #ifdef DEBUG // If converting to float/double, the operand must be 4 or 8 byte in size. if (varTypeIsFloating(castToType)) { unsigned opSize = genTypeSize(castOpType); assert(opSize == 4 || opSize == 8); } #endif //DEBUG // U8 -> R8 conversion requires that the operand be in a register. if (castOpType != TYP_ULONG) { if (castOp->isMemoryOp() || castOp->IsCnsNonZeroFltOrDbl()) { MakeSrcContained(tree, castOp); } } } #if !defined(_TARGET_64BIT_) if (varTypeIsLong(castOpType)) { noway_assert(castOp->OperGet() == GT_LONG); info->srcCount = 2; } #endif // !defined(_TARGET_64BIT_) // some overflow checks need a temp reg: // - GT_CAST from INT64/UINT64 to UINT32 if (tree->gtOverflow() && (castToType == TYP_UINT)) { if (genTypeSize(castOpType) == 8) { info->internalIntCount = 1; } } } break; case GT_NEG: info->srcCount = 1; info->dstCount = 1; // TODO-XArch-CQ: // SSE instruction set doesn't have an instruction to negate a number. // The recommended way is to xor the float/double number with a bitmask. // The only way to xor is using xorps or xorpd both of which operate on // 128-bit operands. To hold the bit-mask we would need another xmm // register or a 16-byte aligned 128-bit data constant. 
Right now emitter // lacks the support for emitting such constants or instruction with mem // addressing mode referring to a 128-bit operand. For now we use an // internal xmm register to load 32/64-bit bitmask from data section. // Note that by trading additional data section memory (128-bit) we can // save on the need for an internal register and also a memory-to-reg // move. // // Note: another option to avoid internal register requirement is by // lowering as GT_SUB(0, src). This will generate code different from // Jit64 and could possibly result in compat issues (?). if (varTypeIsFloating(tree)) { info->internalFloatCount = 1; info->setInternalCandidates(l, l->internalFloatRegCandidates()); } break; case GT_NOT: info->srcCount = 1; info->dstCount = 1; break; case GT_LSH: case GT_RSH: case GT_RSZ: case GT_ROL: case GT_ROR: { info->srcCount = 2; info->dstCount = 1; // For shift operations, we need that the number // of bits moved gets stored in CL in case // the number of bits to shift is not a constant. GenTreePtr shiftBy = tree->gtOp.gtOp2; GenTreePtr source = tree->gtOp.gtOp1; // x64 can encode 8 bits of shift and it will use 5 or 6. (the others are masked off) // We will allow whatever can be encoded - hope you know what you are doing. if (!IsContainableImmed(tree, shiftBy) || shiftBy->gtIntConCommon.IconValue() > 255 || shiftBy->gtIntConCommon.IconValue() < 0) { source->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX); shiftBy->gtLsraInfo.setSrcCandidates(l, RBM_RCX); info->setDstCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX); } else { MakeSrcContained(tree, shiftBy); } } break; case GT_EQ: case GT_NE: case GT_LT: case GT_LE: case GT_GE: case GT_GT: LowerCmp(tree); break; case GT_CKFINITE: info->srcCount = 1; info->dstCount = 1; info->internalIntCount = 1; break; case GT_CMPXCHG: info->srcCount = 3; info->dstCount = 1; // comparand is preferenced to RAX. // Remaining two operands can be in any reg other than RAX. tree->gtCmpXchg.gtOpComparand->gtLsraInfo.setSrcCandidates(l, RBM_RAX); tree->gtCmpXchg.gtOpLocation->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX); tree->gtCmpXchg.gtOpValue->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX); tree->gtLsraInfo.setDstCandidates(l, RBM_RAX); break; case GT_LOCKADD: info->srcCount = 2; info->dstCount = 0; CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2); break; case GT_CALL: { info->srcCount = 0; info->dstCount = (tree->TypeGet() != TYP_VOID) ? 1 : 0; GenTree *ctrlExpr = tree->gtCall.gtControlExpr; if (tree->gtCall.gtCallType == CT_INDIRECT) { // either gtControlExpr != null or gtCallAddr != null. // Both cannot be non-null at the same time. assert(ctrlExpr == nullptr); assert(tree->gtCall.gtCallAddr != nullptr); ctrlExpr = tree->gtCall.gtCallAddr; } // set reg requirements on call target represented as control sequence. if (ctrlExpr != nullptr) { // we should never see a gtControlExpr whose type is void. assert(ctrlExpr->TypeGet() != TYP_VOID); // call can take a Rm op on x64 info->srcCount++; // In case of fast tail implemented as jmp, make sure that gtControlExpr is // computed into a register. if (!tree->gtCall.IsFastTailCall()) { if (ctrlExpr->isIndir()) { MakeSrcContained(tree, ctrlExpr); } } else { // Fast tail call - make sure that call target is always computed in RAX // so that epilog sequence can generate "jmp rax" to achieve fast tail call. 
ctrlExpr->gtLsraInfo.setSrcCandidates(l, RBM_RAX); } } // If this is a varargs call, we will clear the internal candidates in case we need // to reserve some integer registers for copying float args. // We have to do this because otherwise the default candidates are allRegs, and adding // the individual specific registers will have no effect. if (tree->gtCall.IsVarargs()) { tree->gtLsraInfo.setInternalCandidates(l, RBM_NONE); } // Set destination candidates for return value of the call. if (varTypeIsFloating(registerType)) { #ifdef _TARGET_X86_ // The return value will be on the X87 stack, and we will need to move it. info->setDstCandidates(l, l->allRegs(registerType)); #else // !_TARGET_X86_ info->setDstCandidates(l, RBM_FLOATRET); #endif // !_TARGET_X86_ } else if (registerType == TYP_LONG) { info->setDstCandidates(l, RBM_LNGRET); } else { info->setDstCandidates(l, RBM_INTRET); } // number of args to a call = // callRegArgs + (callargs - placeholders, setup, etc) // there is an explicit thisPtr but it is redundant // If there is an explicit this pointer, we don't want that node to produce anything // as it is redundant if (tree->gtCall.gtCallObjp != nullptr) { GenTreePtr thisPtrNode = tree->gtCall.gtCallObjp; if (thisPtrNode->gtOper == GT_PUTARG_REG) { l->clearOperandCounts(thisPtrNode); l->clearDstCount(thisPtrNode->gtOp.gtOp1); } else { l->clearDstCount(thisPtrNode); } } // First, count reg args #if FEATURE_VARARG bool callHasFloatRegArgs = false; #endif // !FEATURE_VARARG for (GenTreePtr list = tree->gtCall.gtCallLateArgs; list; list = list->MoveNext()) { assert(list->IsList()); GenTreePtr argNode = list->Current(); fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(tree, argNode); assert(curArgTabEntry); if (curArgTabEntry->regNum == REG_STK) { // late arg that is not passed in a register DISPNODE(argNode); assert(argNode->gtOper == GT_PUTARG_STK); argNode->gtLsraInfo.srcCount = 1; argNode->gtLsraInfo.dstCount = 0; #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING // If the node is a struct and it is put on stack with // putarg_stk operation, we consume and produce no registers. // In this case the embedded LdObj node should not produce // registers too since it is contained. if (argNode->TypeGet() == TYP_STRUCT) { assert(argNode != nullptr && argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_LDOBJ); argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0; argNode->gtLsraInfo.srcCount = 0; } #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING continue; } regNumber argReg = REG_NA; regMaskTP argMask = RBM_NONE; short regCount = 0; bool isOnStack = true; if (curArgTabEntry->regNum != REG_STK) { isOnStack = false; var_types argType = argNode->TypeGet(); #if FEATURE_VARARG callHasFloatRegArgs |= varTypeIsFloating(argType); #endif // !FEATURE_VARARG argReg = curArgTabEntry->regNum; regCount = 1; // Default case is that we consume one source; modify this later (e.g. for // promoted structs) info->srcCount++; argMask = genRegMask(argReg); argNode = argNode->gtEffectiveVal(); } // If the struct arg is wraped in CPYBLK the type of the param will be TYP_VOID. // Use the curArgTabEntry's isStruct to get whether the param is a struct. 
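        // Illustrative example (assuming the SysV AMD64 classification): a 16-byte struct with an
        // int field and a double field is passed in one integer register and one xmm register, so it
        // reaches this point as a GT_LIST of two PUTARG_REG nodes; the register masks for both are
        // computed in the GT_LIST handling below.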
if (argNode->TypeGet() == TYP_STRUCT FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(|| curArgTabEntry->isStruct)) { unsigned originalSize = 0; LclVarDsc* varDsc = nullptr; if (argNode->gtOper == GT_LCL_VAR) { varDsc = compiler->lvaTable + argNode->gtLclVarCommon.gtLclNum; originalSize = varDsc->lvSize(); } else if (argNode->gtOper == GT_MKREFANY) { originalSize = 2 * TARGET_POINTER_SIZE; } else if (argNode->gtOper == GT_LDOBJ) { noway_assert(!"GT_LDOBJ not supported for amd64"); } #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING else if (argNode->gtOper == GT_PUTARG_REG) { originalSize = genTypeSize(argNode->gtType); } else if (argNode->gtOper == GT_LIST) { originalSize = 0; // There could be up to 2 PUTARG_REGs in the list GenTreeArgList* argListPtr = argNode->AsArgList(); unsigned iterationNum = 0; for (; argListPtr; argListPtr = argListPtr->Rest()) { GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1; assert(putArgRegNode->gtOper == GT_PUTARG_REG); if (iterationNum == 0) { varDsc = compiler->lvaTable + putArgRegNode->gtOp.gtOp1->gtLclVarCommon.gtLclNum; originalSize = varDsc->lvSize(); assert(originalSize != 0); } else { // Need an extra source for every node, but the first in the list. info->srcCount++; // Get the mask for the second putarg_reg argMask = genRegMask(curArgTabEntry->otherRegNum); } putArgRegNode->gtLsraInfo.setDstCandidates(l, argMask); putArgRegNode->gtLsraInfo.setSrcCandidates(l, argMask); // To avoid redundant moves, have the argument child tree computed in the // register in which the argument is passed to the call. putArgRegNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(putArgRegNode)); iterationNum++; } assert(iterationNum <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS); } #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING else { noway_assert(!"Can't predict unsupported TYP_STRUCT arg kind"); } unsigned slots = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES; unsigned remainingSlots = slots; if (!isOnStack) { remainingSlots = slots - 1; regNumber reg = (regNumber)(argReg + 1); while (remainingSlots > 0 && reg <= REG_ARG_LAST) { argMask |= genRegMask(reg); reg = (regNumber)(reg + 1); remainingSlots--; regCount++; } } short internalIntCount = 0; if (remainingSlots > 0) { // This TYP_STRUCT argument is also passed in the outgoing argument area // We need a register to address the TYP_STRUCT // And we may need 2 #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING internalIntCount = 1; #else // FEATURE_UNIX_AMD64_STRUCT_PASSING internalIntCount = 2; #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING } argNode->gtLsraInfo.internalIntCount = internalIntCount; #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING if (argNode->gtOper == GT_PUTARG_REG) { argNode->gtLsraInfo.setDstCandidates(l, argMask); argNode->gtLsraInfo.setSrcCandidates(l, argMask); } #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING } else { argNode->gtLsraInfo.setDstCandidates(l, argMask); argNode->gtLsraInfo.setSrcCandidates(l, argMask); } // To avoid redundant moves, have the argument child tree computed in the // register in which the argument is passed to the call. if (argNode->gtOper == GT_PUTARG_REG) { argNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(argNode)); } #if FEATURE_VARARG // In the case of a varargs call, the ABI dictates that if we have floating point args, // we must pass the enregistered arguments in both the integer and floating point registers. 
// Since the integer register is not associated with this arg node, we will reserve it as // an internal register so that it is not used during the evaluation of the call node // (e.g. for the target). if (tree->gtCall.IsVarargs() && varTypeIsFloating(argNode)) { regNumber targetReg = compiler->getCallArgIntRegister(argReg); tree->gtLsraInfo.setInternalIntCount(tree->gtLsraInfo.internalIntCount + 1); tree->gtLsraInfo.addInternalCandidates(l, genRegMask(targetReg)); } #endif // FEATURE_VARARG } // Now, count stack args // Note that these need to be computed into a register, but then // they're just stored to the stack - so the reg doesn't // need to remain live until the call. In fact, it must not // because the code generator doesn't actually consider it live, // so it can't be spilled. GenTreePtr args = tree->gtCall.gtCallArgs; while (args) { GenTreePtr arg = args->gtOp.gtOp1; if (!(args->gtFlags & GTF_LATE_ARG)) { TreeNodeInfo* argInfo = &(arg->gtLsraInfo); #if !defined(_TARGET_64BIT_) if (arg->TypeGet() == TYP_LONG) { assert(arg->OperGet() == GT_LONG); GenTreePtr loArg = arg->gtGetOp1(); GenTreePtr hiArg = arg->gtGetOp2(); assert((loArg->OperGet() == GT_PUTARG_STK) && (hiArg->OperGet() == GT_PUTARG_STK)); assert((loArg->gtLsraInfo.dstCount == 1) && (hiArg->gtLsraInfo.dstCount == 1)); loArg->gtLsraInfo.isLocalDefUse = true; hiArg->gtLsraInfo.isLocalDefUse = true; } else #endif // !defined(_TARGET_64BIT_) { if (argInfo->dstCount != 0) { argInfo->isLocalDefUse = true; } // If the child of GT_PUTARG_STK is a constant, we don't need a register to // move it to memory (stack location). // We don't want to make 0 contained, because we can generate smaller code // by zeroing a register and then storing it. argInfo->dstCount = 0; if (arg->gtOper == GT_PUTARG_STK) { op1 = arg->gtOp.gtOp1; if (IsContainableImmed(arg, op1) && !op1->IsZero()) { MakeSrcContained(arg, op1); } } } } args = args->gtOp.gtOp2; } #if FEATURE_VARARG // If it is a fast tail call, it is already preferenced to use RAX. // Therefore, no need set src candidates on call tgt again. if (tree->gtCall.IsVarargs() && callHasFloatRegArgs && !tree->gtCall.IsFastTailCall() && (ctrlExpr != nullptr)) { // Don't assign the call target to any of the argument registers because // we will use them to also pass floating point arguments as required // by Amd64 ABI. ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS)); } #endif // !FEATURE_VARARG } break; case GT_ADDR: { // For a GT_ADDR, the child node should not be evaluated into a register GenTreePtr child = tree->gtOp.gtOp1; assert(!l->isCandidateLocalRef(child)); l->clearDstCount(child); info->srcCount = 0; info->dstCount = 1; } break; #ifdef _TARGET_X86_ case GT_LDOBJ: NYI_X86("GT_LDOBJ"); #endif //_TARGET_X86_ case GT_INITBLK: { // Sources are dest address, initVal and size info->srcCount = 3; info->dstCount = 0; GenTreeInitBlk* initBlkNode = tree->AsInitBlk(); GenTreePtr blockSize = initBlkNode->Size(); GenTreePtr dstAddr = initBlkNode->Dest(); GenTreePtr initVal = initBlkNode->InitVal(); // If we have an InitBlk with constant block size we can optimize several ways: // a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes // we use rep stosb since this reduces the register pressure in LSRA and we have // roughly the same performance as calling the helper. // b) If the size is <= INITBLK_UNROLL_LIMIT bytes and the fill byte is a constant, // we can speed this up by unrolling the loop using SSE2 stores. 
The reason for // this threshold is because our last investigation (Fall 2013), more than 95% of initblks // in our framework assemblies are actually <= INITBLK_UNROLL_LIMIT bytes size, so this is the // preferred code sequence for the vast majority of cases. // This threshold will decide from using the helper or let the JIT decide to inline // a code sequence of its choice. ssize_t helperThreshold = max(INITBLK_STOS_LIMIT, INITBLK_UNROLL_LIMIT); if (blockSize->IsCnsIntOrI() && blockSize->gtIntCon.gtIconVal <= helperThreshold) { ssize_t size = blockSize->gtIntCon.gtIconVal; // Always favor unrolling vs rep stos. if (size <= INITBLK_UNROLL_LIMIT && initVal->IsCnsIntOrI()) { // Replace the integer constant in initVal // to fill an 8-byte word with the fill value of the InitBlk assert(initVal->gtIntCon.gtIconVal == (initVal->gtIntCon.gtIconVal & 0xFF)); #ifdef _TARGET_AMD64_ if (size < REGSIZE_BYTES) { initVal->gtIntCon.gtIconVal = 0x01010101 * initVal->gtIntCon.gtIconVal; } else { initVal->gtIntCon.gtIconVal = 0x0101010101010101LL * initVal->gtIntCon.gtIconVal; initVal->gtType = TYP_LONG; } #else // !_TARGET_AMD64_ initVal->gtIntCon.gtIconVal = 0x01010101 * initVal->gtIntCon.gtIconVal; #endif // !_TARGET_AMD64_ MakeSrcContained(tree, blockSize); // In case we have a buffer >= 16 bytes // we can use SSE2 to do a 128-bit store in a single // instruction. if (size >= XMM_REGSIZE_BYTES) { // Reserve an XMM register to fill it with // a pack of 16 init value constants. info->internalFloatCount = 1; info->setInternalCandidates(l, l->internalFloatRegCandidates()); } initBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindUnroll; } else { // rep stos has the following register requirements: // a) The memory address to be in RDI. // b) The fill value has to be in RAX. // c) The buffer size must be in RCX. dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_RDI); initVal->gtLsraInfo.setSrcCandidates(l, RBM_RAX); blockSize->gtLsraInfo.setSrcCandidates(l, RBM_RCX); initBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindRepInstr; } } else { #ifdef _TARGET_AMD64_ // The helper follows the regular AMD64 ABI. dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_ARG_0); initVal->gtLsraInfo.setSrcCandidates(l, RBM_ARG_1); blockSize->gtLsraInfo.setSrcCandidates(l, RBM_ARG_2); initBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindHelper; #else // !_TARGET_AMD64_ NYI("InitBlk helper call for RyuJIT/x86"); #endif // !_TARGET_AMD64_ } break; } case GT_COPYOBJ: { // Sources are src, dest and size (or class token for CpObj). info->srcCount = 3; info->dstCount = 0; GenTreeCpObj* cpObjNode = tree->AsCpObj(); GenTreePtr clsTok = cpObjNode->ClsTok(); GenTreePtr dstAddr = cpObjNode->Dest(); GenTreePtr srcAddr = cpObjNode->Source(); unsigned slots = cpObjNode->gtSlots; #ifdef DEBUG // CpObj must always have at least one GC-Pointer as a member. assert(cpObjNode->gtGcPtrCount > 0); assert(dstAddr->gtType == TYP_BYREF || dstAddr->gtType == TYP_I_IMPL); assert(clsTok->IsIconHandle()); CORINFO_CLASS_HANDLE clsHnd = (CORINFO_CLASS_HANDLE)clsTok->gtIntCon.gtIconVal; size_t classSize = compiler->info.compCompHnd->getClassSize(clsHnd); size_t blkSize = roundUp(classSize, TARGET_POINTER_SIZE); // Currently, the EE always round up a class data structure so // we are not handling the case where we have a non multiple of pointer sized // struct. This behavior may change in the future so in order to keeps things correct // let's assert it just to be safe. Going forward we should simply // handle this case. 
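        // Worked example of the assumption being asserted below: a struct holding one object
        // reference and one int is reported by getClassSize as 16 bytes on AMD64 (the int is padded
        // out to a full pointer-sized slot), so classSize equals blkSize and slots is 2.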
assert(classSize == blkSize); assert((blkSize / TARGET_POINTER_SIZE) == slots); assert((cpObjNode->gtFlags & GTF_BLK_HASGCPTR) != 0); #endif bool IsRepMovsProfitable = false; // If the destination is not on the stack, let's find out if we // can improve code size by using rep movsq instead of generating // sequences of movsq instructions. if (!dstAddr->OperIsLocalAddr()) { // Let's inspect the struct/class layout and determine if it's profitable // to use rep movsq for copying non-gc memory instead of using single movsq // instructions for each memory slot. unsigned i = 0; BYTE* gcPtrs = cpObjNode->gtGcPtrs; do { unsigned nonGCSlots = 0; // Measure a contiguous non-gc area inside the struct and note the maximum. while (i < slots && gcPtrs[i] == TYPE_GC_NONE) { nonGCSlots++; i++; } while (i < slots && gcPtrs[i] != TYPE_GC_NONE) { i++; } if (nonGCSlots >= CPOBJ_NONGC_SLOTS_LIMIT) { IsRepMovsProfitable = true; break; } } while (i < slots); } else if (slots >= CPOBJ_NONGC_SLOTS_LIMIT) { IsRepMovsProfitable = true; } // There are two cases in which we need to materialize the // struct size: // a) When the destination is on the stack we don't need to use the // write barrier, we can just simply call rep movsq and get a win in codesize. // b) If we determine we have contiguous non-gc regions in the struct where it's profitable // to use rep movsq instead of a sequence of single movsq instructions. According to the // Intel Manual, the sweet spot for small structs is between 4 to 12 slots of size where // the entire operation takes 20 cycles and encodes in 5 bytes (moving RCX, and calling rep movsq). if (IsRepMovsProfitable) { // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq. MakeSrcContained(tree, clsTok); info->internalIntCount = 1; info->setInternalCandidates(l, RBM_RCX); } else { // We don't need to materialize the struct size because we will unroll // the loop using movsq that automatically increments the pointers. MakeSrcContained(tree, clsTok); } dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_RDI); srcAddr->gtLsraInfo.setSrcCandidates(l, RBM_RSI); } break; #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING case GT_PUTARG_STK: { if (tree->TypeGet() != TYP_STRUCT) { TreeNodeInfoInitSimple(tree, info, kind); break; } GenTreePutArgStk* putArgStkTree = tree->AsPutArgStk(); GenTreePtr dstAddr = tree; GenTreePtr srcAddr = tree->gtOp.gtOp1; assert(srcAddr->OperGet() == GT_LDOBJ); info->srcCount = srcAddr->gtLsraInfo.dstCount; // If this is a stack variable address, // make the op1 contained, so this way // there is no unnecessary copying between registers. // To avoid assertion, increment the parent's source. // It is recovered below. if (srcAddr->gtGetOp1()->OperIsLocalAddr()) { info->srcCount += 1; } info->dstCount = 0; // In case of a CpBlk we could use a helper call. In case of putarg_stk we // can't do that since the helper call could kill some already set up outgoing args. // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj. // The cpyXXXX code is rather complex and this could cause it to be more complex, but // it might be the right thing to do. // This threshold will decide from using the helper or let the JIT decide to inline // a code sequence of its choice. ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT); ssize_t size = putArgStkTree->gtNumSlots * TARGET_POINTER_SIZE; // TODO-X86-CQ: The helper call either is not supported on x86 or required more work // (I don't know which). 
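        // Concrete example of the unrolled path chosen below (assuming a 3-slot struct, i.e.
        // size == 24, which is within CPBLK_UNROLL_LIMIT, and no GC reference slots):
        // 24 >= XMM_REGSIZE_BYTES, so one internal xmm register is reserved for a 16-byte copy, and
        // (24 & (XMM_REGSIZE_BYTES - 1)) != 0, so one internal int register covers the remaining 8 bytes.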
// If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2. // Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of // our framework assemblies, so this is the main code generation scheme we'll use. if (size <= CPBLK_UNROLL_LIMIT && putArgStkTree->gtNumberReferenceSlots == 0) { // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg. // // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte. // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude // RBM_NON_BYTE_REGS from internal candidates. if ((size & (XMM_REGSIZE_BYTES - 1)) != 0) { info->internalIntCount++; regMaskTP regMask = l->allRegs(TYP_INT); #ifdef _TARGET_X86_ if ((size % 2) != 0) { regMask &= ~RBM_NON_BYTE_REGS; } #endif info->setInternalCandidates(l, regMask); } if (size >= XMM_REGSIZE_BYTES) { // If we have a buffer larger than XMM_REGSIZE_BYTES, // reserve an XMM register to use it for a // series of 16-byte loads and stores. info->internalFloatCount = 1; info->addInternalCandidates(l, l->internalFloatRegCandidates()); } if (srcAddr->gtGetOp1()->OperIsLocalAddr()) { MakeSrcContained(putArgStkTree, srcAddr->gtGetOp1()); } // If src or dst are on stack, we don't have to generate the address into a register // because it's just some constant+SP putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindUnroll; } else { info->internalIntCount += 3; info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI)); if (srcAddr->gtGetOp1()->OperIsLocalAddr()) { MakeSrcContained(putArgStkTree, srcAddr->gtGetOp1()); } putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindRepInstr; } // Always mark the LDOBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree. MakeSrcContained(putArgStkTree, srcAddr); // Balance up the inc above. if (srcAddr->gtGetOp1()->OperIsLocalAddr()) { info->srcCount -= 1; } } break; #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING case GT_COPYBLK: { // Sources are src, dest and size (or class token for CpObj). info->srcCount = 3; info->dstCount = 0; GenTreeCpBlk* cpBlkNode = tree->AsCpBlk(); GenTreePtr blockSize = cpBlkNode->Size(); GenTreePtr dstAddr = cpBlkNode->Dest(); GenTreePtr srcAddr = cpBlkNode->Source(); // In case of a CpBlk with a constant size and less than CPBLK_MOVS_LIMIT size // we can use rep movs to generate code instead of the helper call. // This threshold will decide from using the helper or let the JIT decide to inline // a code sequence of its choice. ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT); // TODO-X86-CQ: The helper call either is not supported on x86 or required more work // (I don't know which). #ifdef _TARGET_AMD64_ if (blockSize->IsCnsIntOrI() && blockSize->gtIntCon.gtIconVal <= helperThreshold) #endif // _TARGET_AMD64_ { assert(!blockSize->IsIconHandle()); ssize_t size = blockSize->gtIntCon.gtIconVal; // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2. // Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of // our framework assemblies, so this is the main code generation scheme we'll use. if (size <= CPBLK_UNROLL_LIMIT) { MakeSrcContained(tree, blockSize); // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg. // // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte. 
// But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude // RBM_NON_BYTE_REGS from internal candidates. if ((size & (XMM_REGSIZE_BYTES - 1)) != 0) { info->internalIntCount++; regMaskTP regMask = l->allRegs(TYP_INT); #ifdef _TARGET_X86_ if ((size % 2) != 0) { regMask &= ~RBM_NON_BYTE_REGS; } #endif info->setInternalCandidates(l, regMask); } if (size >= XMM_REGSIZE_BYTES) { // If we have a buffer larger than XMM_REGSIZE_BYTES, // reserve an XMM register to use it for a // series of 16-byte loads and stores. info->internalFloatCount = 1; info->addInternalCandidates(l, l->internalFloatRegCandidates()); } // If src or dst are on stack, we don't have to generate the address into a register // because it's just some constant+SP if (srcAddr->OperIsLocalAddr()) { MakeSrcContained(tree, srcAddr); } if (dstAddr->OperIsLocalAddr()) { MakeSrcContained(tree, dstAddr); } cpBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindUnroll; } else { dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_RDI); srcAddr->gtLsraInfo.setSrcCandidates(l, RBM_RSI); blockSize->gtLsraInfo.setSrcCandidates(l, RBM_RCX); cpBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindRepInstr; } } #ifdef _TARGET_AMD64_ else { // In case we have a constant integer this means we went beyond // CPBLK_MOVS_LIMIT bytes of size, still we should never have the case of // any GC-Pointers in the src struct. if (blockSize->IsCnsIntOrI()) { assert(!blockSize->IsIconHandle()); } dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_ARG_0); srcAddr->gtLsraInfo.setSrcCandidates(l, RBM_ARG_1); blockSize->gtLsraInfo.setSrcCandidates(l, RBM_ARG_2); cpBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindHelper; } #endif // _TARGET_AMD64_ } break; case GT_LCLHEAP: { info->srcCount = 1; info->dstCount = 1; // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp): // Here '-' means don't care. // // Size? Init Memory? # temp regs // 0 - 0 // const and <=6 ptr words - 0 // const and 6 ptr words Yes hasPspSym ? 1 : 0 // Non-const Yes hasPspSym ? 1 : 0 // Non-const No 2 // // PSPSym - If the method has PSPSym increment internalIntCount by 1. // bool hasPspSym; #if FEATURE_EH_FUNCLETS hasPspSym = (compiler->lvaPSPSym != BAD_VAR_NUM); #else hasPspSym = false; #endif GenTreePtr size = tree->gtOp.gtOp1; if (size->IsCnsIntOrI()) { MakeSrcContained(tree, size); size_t sizeVal = size->gtIntCon.gtIconVal; if (sizeVal == 0) { info->internalIntCount = 0; } else { // Compute the amount of memory to properly STACK_ALIGN. // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size. // This should also help in debugging as we can examine the original size specified with localloc. sizeVal = AlignUp(sizeVal, STACK_ALIGN); size_t cntStackAlignedWidthItems = (sizeVal >> STACK_ALIGN_SHIFT); // For small allocations upto 6 pointer sized words (i.e. 48 bytes of localloc) // we will generate 'push 0'. if (cntStackAlignedWidthItems <= 6) { info->internalIntCount = 0; } else if (!compiler->info.compInitMem) { // No need to initialize allocated stack space. if (sizeVal < CORINFO_PAGE_SIZE) { info->internalIntCount = 0; } else { // We need two registers: regCnt and RegTmp info->internalIntCount = 2; } } else { // >6 and need to zero initialize allocated stack space. // If the method has PSPSym, we need an internal register to hold regCnt // since targetReg allocated to GT_LCLHEAP node could be the same as one of // the the internal registers. info->internalIntCount = hasPspSym ? 
1 : 0; } } } else { if (!compiler->info.compInitMem) { info->internalIntCount = 2; } else { // If the method has PSPSym, we need an internal register to hold regCnt // since targetReg allocated to GT_LCLHEAP node could be the same as one of // the the internal registers. info->internalIntCount = hasPspSym ? 1 : 0; } } // If the method has PSPSym, we would need an addtional register to relocate it on stack. if (hasPspSym) { // Exclude const size 0 if (!size->IsCnsIntOrI() || (size->gtIntCon.gtIconVal > 0)) info->internalIntCount++; } } break; case GT_ARR_BOUNDS_CHECK: #ifdef FEATURE_SIMD case GT_SIMD_CHK: #endif // FEATURE_SIMD { GenTreeBoundsChk* node = tree->AsBoundsChk(); // Consumes arrLen & index - has no result info->srcCount = 2; info->dstCount = 0; GenTree* intCns = nullptr; GenTree* other = nullptr; if (CheckImmedAndMakeContained(tree, node->gtIndex)) { intCns = node->gtIndex; other = node->gtArrLen; } else if (CheckImmedAndMakeContained(tree, node->gtArrLen)) { intCns = node->gtArrLen; other = node->gtIndex; } else { other = node->gtIndex; } if (other->isMemoryOp()) { MakeSrcContained(tree, other); } } break; case GT_ARR_ELEM: // These must have been lowered to GT_ARR_INDEX noway_assert(!"We should never see a GT_ARR_ELEM in lowering"); info->srcCount = 0; info->dstCount = 0; break; case GT_ARR_INDEX: info->srcCount = 2; info->dstCount = 1; // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple // times while the result is being computed. tree->AsArrIndex()->ArrObj()->gtLsraInfo.isDelayFree = true; info->hasDelayFreeSrc = true; break; case GT_ARR_OFFSET: // This consumes the offset, if any, the arrObj and the effective index, // and produces the flattened offset for this dimension. info->srcCount = 3; info->dstCount = 1; info->internalIntCount = 1; // we don't want to generate code for this if (tree->gtArrOffs.gtOffset->IsZero()) { MakeSrcContained(tree, tree->gtArrOffs.gtOffset); } break; case GT_LEA: // The LEA usually passes its operands through to the GT_IND, in which case we'll // clear the info->srcCount and info->dstCount later, but we may be instantiating an address, // so we set them here. info->srcCount = 0; if (tree->AsAddrMode()->Base() != nullptr) { info->srcCount++; } if (tree->AsAddrMode()->Index() != nullptr) { info->srcCount++; } info->dstCount = 1; break; case GT_STOREIND: { info->srcCount = 2; info->dstCount = 0; GenTree* src = tree->gtOp.gtOp2; if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree)) { LowerGCWriteBarrier(tree); break; } // If the source is a containable immediate, make it contained, unless it is // an int-size or larger store of zero to memory, because we can generate smaller code // by zeroing a register and then storing it. if (IsContainableImmed(tree, src) && (!src->IsZero() || varTypeIsSmall(tree) || tree->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR)) { MakeSrcContained(tree, src); } // Perform recognition of trees with the following structure: // StoreInd(IndA, BinOp(expr, IndA)) // to be able to fold this into an instruction of the form // BINOP [addressing mode for IndA], register // where register is the actual place where 'expr' // is computed. // // SSE2 doesn't support RMW form of instructions. if (!varTypeIsFloating(tree) && LowerStoreInd(tree)) break; GenTreePtr addr = tree->gtOp.gtOp1; HandleIndirAddressExpression(tree, addr); } break; case GT_NULLCHECK: info->isLocalDefUse = true; __fallthrough; case GT_IND: { info->dstCount = tree->OperGet() == GT_NULLCHECK ? 
0 : 1; info->srcCount = 1; GenTreePtr addr = tree->gtOp.gtOp1; HandleIndirAddressExpression(tree, addr); } break; case GT_CATCH_ARG: info->srcCount = 0; info->dstCount = 1; info->setDstCandidates(l, RBM_EXCEPTION_OBJECT); break; #if !FEATURE_EH_FUNCLETS case GT_END_LFIN: NYI_X86("Implement GT_END_LFIN for x86"); #endif case GT_CLS_VAR: info->srcCount = 0; // GT_CLS_VAR, by the time we reach the backend, must always // be a pure use. // It will produce a result of the type of the // node, and use an internal register for the address. info->dstCount = 1; assert((tree->gtFlags & (GTF_VAR_DEF|GTF_VAR_USEASG|GTF_VAR_USEDEF)) == 0); info->internalIntCount = 1; break; } // end switch (tree->OperGet()) if (tree->OperIsBinary() && info->srcCount >= 2) { if (isRMWRegOper(tree)) { GenTree* op1 = tree->gtOp.gtOp1; GenTree* op2 = tree->gtOp.gtOp2; // If we have a read-modify-write operation, we want to preference op1 to the target. // If op1 is contained, we don't want to preference it, but it won't // show up as a source in that case, so it will be ignored. op1->gtLsraInfo.isTgtPref = true; // Is this a non-commutative operator, or is op2 a contained memory op? // (Note that we can't call IsContained() at this point because it uses exactly the // same information we're currently computing.) // In either case, we need to make op2 remain live until the op is complete, by marking // the source(s) associated with op2 as "delayFree". // Note that if op2 of a binary RMW operator is a memory op, even if the operator // is commutative, codegen cannot reverse them. // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's // more work to be done to correctly reverse the operands if they involve memory // operands. Also, we may need to handle more cases than GT_IND, especially once // we've modified the register allocator to not require all nodes to be assigned // a register (e.g. a spilled lclVar can often be referenced directly from memory). // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op. GenTree* delayUseSrc = nullptr; // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have // to special case them. if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD) { delayUseSrc = op1; } else if ((op2 != nullptr) && (!tree->OperIsCommutative() || (op2->isMemoryOp() && (op2->gtLsraInfo.srcCount == 0)))) { delayUseSrc = op2; } if (delayUseSrc != nullptr) { // If delayUseSrc is an indirection and it doesn't produce a result, then we need to set "delayFree' // on the base & index, if any. // Otherwise, we set it on delayUseSrc itself. if (delayUseSrc->isIndir() && (delayUseSrc->gtLsraInfo.dstCount == 0)) { GenTree* base = delayUseSrc->AsIndir()->Base(); GenTree* index = delayUseSrc->AsIndir()->Index(); if (base != nullptr) { base->gtLsraInfo.isDelayFree = true; } if (index != nullptr) { index->gtLsraInfo.isDelayFree = true; } } else { delayUseSrc->gtLsraInfo.isDelayFree = true; } info->hasDelayFreeSrc = true; } } } #ifdef _TARGET_X86_ // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands // if the tree node is a byte type. // // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr' // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT // value. In this case we need to exclude esi/edi from the src candidates of op2. 
// // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool. // // Though this looks conservative in theory, in practice we could not think of a case where // the below logic leads to conservative register specification. In future when or if we find // one such case, this logic needs to be fine tuned for that case(s). if (varTypeIsByte(tree) || ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType()))) { regMaskTP regMask; if (info->dstCount > 0) { regMask = info->getDstCandidates(l); assert(regMask != RBM_NONE); info->setDstCandidates(l, regMask & ~RBM_NON_BYTE_REGS); } if (info->srcCount > 0) { // No need to set src candidates on a contained child operand. GenTree *op = tree->gtOp.gtOp1; assert(op != nullptr); bool containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0); if (!containedNode) { regMask = op->gtLsraInfo.getSrcCandidates(l); assert(regMask != RBM_NONE); op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS); } op = tree->gtOp.gtOp2; if (op != nullptr) { containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0); if (!containedNode) { regMask = op->gtLsraInfo.getSrcCandidates(l); assert(regMask != RBM_NONE); op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS); } } } } #endif //_TARGET_X86_ tree = next; // We need to be sure that we've set info->srcCount and info->dstCount appropriately assert(info->dstCount < 2); } } #ifdef FEATURE_SIMD //------------------------------------------------------------------------ // TreeNodeInfoInitSIMD: Set the NodeInfo for a GT_SIMD tree. // // Arguments: // tree - The GT_SIMD node of interest // // Return Value: // None. void Lowering::TreeNodeInfoInitSIMD(GenTree* tree, LinearScan* lsra) { GenTreeSIMD* simdTree = tree->AsSIMD(); TreeNodeInfo* info = &(tree->gtLsraInfo); info->dstCount = 1; switch(simdTree->gtSIMDIntrinsicID) { GenTree* op2; case SIMDIntrinsicInit: { info->srcCount = 1; GenTree* op1 = tree->gtOp.gtOp1; // This sets all fields of a SIMD struct to the given value. // Mark op1 as contained if it is either zero or int constant of all 1's, // or a float constant with 16 or 32 byte simdType (AVX case) // // Should never see small int base type vectors except for zero initialization. assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsZero()); if (op1->IsZero() || (simdTree->gtSIMDBaseType == TYP_INT && op1->IsCnsIntOrI() && op1->AsIntConCommon()->IconValue() == 0xffffffff) || (simdTree->gtSIMDBaseType == TYP_LONG && op1->IsCnsIntOrI() && op1->AsIntConCommon()->IconValue() == 0xffffffffffffffffLL) ) { MakeSrcContained(tree, tree->gtOp.gtOp1); info->srcCount = 0; } else if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) && ((simdTree->gtSIMDSize == 16) || (simdTree->gtSIMDSize == 32))) { // Either op1 is a float or dbl constant or an addr if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr()) { MakeSrcContained(tree, tree->gtOp.gtOp1); info->srcCount = 0; } } } break; case SIMDIntrinsicInitN: { info->srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(simdTree->gtSIMDBaseType)); // Need an internal register to stitch together all the values into a single vector in a SIMD reg info->internalFloatCount = 1; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); } break; case SIMDIntrinsicInitArray: // We have an array and an index, which may be contained. 
info->srcCount = 2; CheckImmedAndMakeContained(tree, tree->gtGetOp2()); break; case SIMDIntrinsicDiv: // SSE2 has no instruction support for division on integer vectors noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType)); info->srcCount = 2; break; case SIMDIntrinsicAbs: // This gets implemented as bitwise-And operation with a mask // and hence should never see it here. unreached(); break; case SIMDIntrinsicSqrt: // SSE2 has no instruction support for sqrt on integer vectors. noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType)); info->srcCount = 1; break; case SIMDIntrinsicAdd: case SIMDIntrinsicSub: case SIMDIntrinsicMul: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseAndNot: case SIMDIntrinsicBitwiseOr: case SIMDIntrinsicBitwiseXor: case SIMDIntrinsicMin: case SIMDIntrinsicMax: info->srcCount = 2; // SSE2 32-bit integer multiplication requires two temp regs if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT) { info->internalFloatCount = 2; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); } break; case SIMDIntrinsicEqual: info->srcCount = 2; break; // SSE2 doesn't support < and <= directly on int vectors. // Instead we need to use > and >= with swapped operands. case SIMDIntrinsicLessThan: case SIMDIntrinsicLessThanOrEqual: info->srcCount = 2; noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType)); break; // SIMDIntrinsicEqual is supported only on non-floating point base type vectors. // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors. // Instead we need to use < and <= with swapped operands. case SIMDIntrinsicGreaterThan: noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType)); info->srcCount = 2; break; case SIMDIntrinsicOpEquality: case SIMDIntrinsicOpInEquality: // Need two SIMD registers as scratch. // See genSIMDIntrinsicRelOp() for details on code sequence generate and // the need for two scratch registers. info->srcCount = 2; info->internalFloatCount = 2; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); break; case SIMDIntrinsicDotProduct: if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) || (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32)) { // For SSE, or AVX with 32-byte vectors, we also need an internal register as scratch. // Further we need the targetReg and internal reg to be distinct registers. // This is achieved by requesting two internal registers; thus one of them // will be different from targetReg. // Note that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg. // // See genSIMDIntrinsicDotProduct() for details on code sequence generated and // the need for scratch registers. info->internalFloatCount = 2; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); } info->srcCount = 2; break; case SIMDIntrinsicGetItem: // This implements get_Item method. The sources are: // - the source SIMD struct // - index (which element to get) // The result is baseType of SIMD struct. info->srcCount = 2; op2 = tree->gtOp.gtOp2; // If the index is a constant, mark it as contained. if (CheckImmedAndMakeContained(tree, op2)) { info->srcCount = 1; } // If the index is not a constant, we will use the SIMD temp location to store the vector. // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we // can use that in the process of extracting the element. // // If the index is a constant and base type is a small int we can use pextrw, but on AVX // we will need a temp if are indexing into the upper half of the AVX register. 
// In all other cases with constant index, we need a temp xmm register to extract the // element if index is other than zero. if (!op2->IsCnsIntOrI()) { (void) comp->getSIMDInitTempVarNum(); } else if (!varTypeIsFloating(simdTree->gtSIMDBaseType)) { bool needFloatTemp; if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) && (comp->getSIMDInstructionSet() == InstructionSet_AVX)) { int byteShiftCnt = (int) op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType); needFloatTemp = (byteShiftCnt >= 16); } else { needFloatTemp = !op2->IsZero(); } if (needFloatTemp) { info->internalFloatCount = 1; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); } } break; case SIMDIntrinsicSetX: case SIMDIntrinsicSetY: case SIMDIntrinsicSetZ: case SIMDIntrinsicSetW: // We need an internal integer register info->srcCount = 2; info->internalIntCount = 1; info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT)); break; case SIMDIntrinsicCast: info->srcCount = 1; break; case SIMDIntrinsicShuffleSSE2: info->srcCount = 2; // Second operand is an integer constant and marked as contained. op2 = tree->gtOp.gtOp2; noway_assert(op2->IsCnsIntOrI()); MakeSrcContained(tree, op2); break; case SIMDIntrinsicGetX: case SIMDIntrinsicGetY: case SIMDIntrinsicGetZ: case SIMDIntrinsicGetW: case SIMDIntrinsicGetOne: case SIMDIntrinsicGetZero: case SIMDIntrinsicGetCount: case SIMDIntrinsicGetAllOnes: assert(!"Get intrinsics should not be seen during Lowering."); unreached(); default: noway_assert(!"Unimplemented SIMD node type."); unreached(); } } #endif // FEATURE_SIMD void Lowering::LowerGCWriteBarrier(GenTree* tree) { GenTreePtr dst = tree; GenTreePtr addr = tree->gtOp.gtOp1; GenTreePtr src = tree->gtOp.gtOp2; if (addr->OperGet() == GT_LEA) { // In the case where we are doing a helper assignment, if the dst // is an indir through an lea, we need to actually instantiate the // lea in a register GenTreeAddrMode* lea = addr->AsAddrMode(); short leaSrcCount = 0; if (lea->Base() != nullptr) { leaSrcCount++; } if (lea->Index() != nullptr) { leaSrcCount++; } lea->gtLsraInfo.srcCount = leaSrcCount; lea->gtLsraInfo.dstCount = 1; } // !!! This code was leveraged from codegen.cpp #if NOGC_WRITE_BARRIERS #ifdef _TARGET_AMD64_ #error "NOGC_WRITE_BARRIERS is not supported for _TARGET_AMD64" #else // !_TARGET_AMD64_ NYI("NYI: NOGC_WRITE_BARRIERS for RyuJIT/x86"); #endif // !_TARGET_AMD64_ #endif // NOGC_WRITE_BARRIERS // For the standard JIT Helper calls // op1 goes into REG_ARG_0 and // op2 goes into REG_ARG_1 // Set this RefPosition, and the previous one, to the physical // register instead of a virtual one // addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0); src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1); // Both src and dst must reside in a register, which they should since we haven't set // either of them as contained. assert(addr->gtLsraInfo.dstCount == 1); assert(src->gtLsraInfo.dstCount == 1); } void Lowering::HandleIndirAddressExpression(GenTree* indirTree, GenTree* addr) { GenTree* base = nullptr; GenTree* index = nullptr; unsigned mul, cns; bool rev; bool modifiedSources = false; TreeNodeInfo* info = &(indirTree->gtLsraInfo); // If indirTree is of TYP_SIMD12, don't mark addr as contained // so that it always get computed to a register. This would // mean codegen side logic doesn't need to handle all possible // addr expressions that could be contained. // // TODO-XArch-CQ: handle other addr mode expressions that could be marked // as contained. 
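    // Hedged sketch of why a TYP_SIMD12 address stays in a register (assumed instruction shapes;
    // the authoritative sequence lives on the codegen side): a Vector3 indirection is split into
    // an 8-byte and a 4-byte access, roughly
    //     movsd   xmm0, qword ptr [addrReg]       ; lower 8 bytes
    //     movss   xmm1, dword ptr [addrReg + 8]   ; upper 4 bytes, then stitched into xmm0
    // so the address is evaluated once into a register and internal XMM registers are reserved
    // by the block below, rather than folding 'addr' into the indirection.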
#ifdef FEATURE_SIMD
    if (indirTree->TypeGet() == TYP_SIMD12)
    {
        // Vector3 is read/written as two reads/writes: 8 bytes and 4 bytes.
        // To assemble the vector properly we would need an additional
        // XMM register.
        info->internalFloatCount = 1;

        // In case of GT_IND we need an internal register different from targetReg and
        // both of the registers are used at the same time. This is achieved by reserving
        // two internal registers.
        if (indirTree->OperGet() == GT_IND)
        {
            (info->internalFloatCount)++;
        }

        info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
        return;
    }
#endif // FEATURE_SIMD

    // These nodes go into an addr mode:
    // - GT_CLS_VAR_ADDR turns into a constant.
    // - GT_LCL_VAR_ADDR is a stack addr mode.
    if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
    {
        // make this contained, it turns into a constant that goes into an addr mode
        MakeSrcContained(indirTree, addr);
    }
    // TODO-XArch-CQ: The below condition is incorrect and needs to be revisited for the following reasons:
    // a) FitsInAddrBase() already checks for opts.compReloc and
    // b) opts.compReloc is set only during Ngen.
    // c) During lowering we should not be checking gtRegNum.
    // For the above reasons this condition will never be true and indirs of absolute addresses
    // that can be encoded as a PC-relative 32-bit offset are never marked as contained.
    //
    // The right condition to check here is probably
    // "addr->IsCnsIntOrI() && comp->codeGen->genAddrShouldUsePCRel(addr->AsIntConCommon()->IconValue())"
    //
    // Apart from making this change, codegen side changes are needed to handle contained addr
    // where GT_IND is possible as an operand.
    else if (addr->IsCnsIntOrI() &&
             addr->AsIntConCommon()->FitsInAddrBase(comp) &&
             comp->opts.compReloc &&
             (addr->gtRegNum != REG_NA))
    {
        MakeSrcContained(indirTree, addr);
    }
    else if (addr->OperGet() == GT_LEA)
    {
        GenTreeAddrMode* lea = addr->AsAddrMode();
        base  = lea->Base();
        index = lea->Index();

        m_lsra->clearOperandCounts(addr);
        // The srcCount is decremented because addr is now "contained",
        // then we account for the base and index below, if they are non-null.
        info->srcCount--;
    }
    else if (comp->codeGen->genCreateAddrMode(addr, -1, true, 0, &rev, &base, &index, &mul, &cns, true /*nogen*/) &&
             !(modifiedSources = AreSourcesPossiblyModified(indirTree, base, index)))
    {
        // An addressing mode will be constructed that may cause some
        // nodes to not need a register, and cause others' lifetimes to be extended
        // to the GT_IND or even its parent if it's an assignment.
        assert(base != addr);
        m_lsra->clearOperandCounts(addr);

        GenTreePtr arrLength = nullptr;

        // Traverse the computation below GT_IND to find the operands
        // for the addressing mode, marking the various constants and
        // intermediate results as not consuming/producing.
        // If the traversal were more complex, we might consider using
        // a traversal function, but the addressing mode is only made
        // up of simple arithmetic operators, and the code generator
        // only traverses one leg of each node.

        bool foundBase  = (base == nullptr);
        bool foundIndex = (index == nullptr);
        GenTreePtr nextChild = nullptr;
        for (GenTreePtr child = addr; child != nullptr && !child->OperIsLeaf(); child = nextChild)
        {
            nextChild = nullptr;
            GenTreePtr op1 = child->gtOp.gtOp1;
            GenTreePtr op2 = (child->OperIsBinary()) ?
child->gtOp.gtOp2 : nullptr; if (op1 == base) { foundBase = true; } else if (op1 == index) { foundIndex = true; } else { m_lsra->clearOperandCounts(op1); if (!op1->OperIsLeaf()) { nextChild = op1; } } if (op2 != nullptr) { if (op2 == base) { foundBase = true; } else if (op2 == index) { foundIndex = true; } else { m_lsra->clearOperandCounts(op2); if (!op2->OperIsLeaf()) { assert(nextChild == nullptr); nextChild = op2; } } } } assert(foundBase && foundIndex); info->srcCount--; // it gets incremented below. } else if (addr->gtOper == GT_ARR_ELEM) { // The GT_ARR_ELEM consumes all the indices and produces the offset. // The array object lives until the mem access. // We also consume the target register to which the address is // computed info->srcCount++; assert(addr->gtLsraInfo.srcCount >= 2); addr->gtLsraInfo.srcCount -= 1; } else { // it is nothing but a plain indir info->srcCount--; //base gets added in below base = addr; } if (base != nullptr) { info->srcCount++; } if (index != nullptr && !modifiedSources) { info->srcCount++; } } void Lowering::LowerCmp(GenTreePtr tree) { TreeNodeInfo* info = &(tree->gtLsraInfo); info->srcCount = 2; info->dstCount = 1; #ifdef _TARGET_X86_ info->setDstCandidates(m_lsra, RBM_BYTE_REGS); #endif // _TARGET_X86_ GenTreePtr op1 = tree->gtOp.gtOp1; GenTreePtr op2 = tree->gtOp.gtOp2; var_types op1Type = op1->TypeGet(); var_types op2Type = op2->TypeGet(); #if !defined(_TARGET_64BIT_) // Long compares will consume GT_LONG nodes, each of which produces two results. // Thus for each long operand there will be an additional source. if (varTypeIsLong(op1Type)) { info->srcCount++; } if (varTypeIsLong(op2Type)) { info->srcCount++; } #endif // !defined(_TARGET_64BIT_) // If either of op1 or op2 is floating point values, then we need to use // ucomiss or ucomisd to compare, both of which support the following form // ucomis[s|d] xmm, xmm/mem. That is only the second operand can be a memory // op. // // Second operand is a memory Op: Note that depending on comparison operator, // the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or // op2 can be a memory op depending on the comparison operator. if (varTypeIsFloating(op1Type)) { // The type of the operands has to be the same and no implicit conversions at this stage. assert(op1Type == op2Type); bool reverseOps; if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0) { // Unordered comparison case reverseOps = (tree->gtOper == GT_GT || tree->gtOper == GT_GE); } else { reverseOps = (tree->gtOper == GT_LT || tree->gtOper == GT_LE); } GenTreePtr otherOp; if (reverseOps) { otherOp = op1; } else { otherOp = op2; } assert(otherOp != nullptr); if (otherOp->IsCnsNonZeroFltOrDbl()) { MakeSrcContained(tree, otherOp); } else if (otherOp->isMemoryOp()) { if ((otherOp == op2) || IsSafeToContainMem(tree, otherOp)) { MakeSrcContained(tree, otherOp); } } return; } // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here // or in other backend. 
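    // Hedged illustration of the floating-point reversal rule above (a sketch, not emitted by
    // this method): for "a < b" on doubles, codegen swaps the operands and tests "b > a",
    // e.g. "ucomisd xmm_b, [a]", so it is op1 ('a') that may become the contained memory
    // operand; for "a > b" the operands keep their order and op2 ('b') may be contained.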
    bool hasShortCast = false;
    if (CheckImmedAndMakeContained(tree, op2))
    {
        bool op1CanBeContained = (op1Type == op2Type);
        if (!op1CanBeContained)
        {
            if (genTypeSize(op1Type) == genTypeSize(op2Type))
            {
                // The constant is of the correct size, but we don't have an exact type match,
                // so we can still treat a memory op1 as "contained".
                op1CanBeContained = true;
            }
        }

        // Do we have a short compare against a constant in op2?
        //
        if (varTypeIsSmall(op1Type))
        {
            GenTreeIntCon* con  = op2->AsIntCon();
            ssize_t        ival = con->gtIconVal;

            bool isEqualityCompare = (tree->gtOper == GT_EQ || tree->gtOper == GT_NE);
            bool useTest           = isEqualityCompare && (ival == 0);

            if (!useTest)
            {
                ssize_t lo = 0;   // minimum imm value allowed for cmp reg,imm
                ssize_t hi = 0;   // maximum imm value allowed for cmp reg,imm
                bool isUnsigned = false;

                switch (op1Type)
                {
                case TYP_BOOL:
                    op1Type = TYP_UBYTE;
                    __fallthrough;
                case TYP_UBYTE:
                    lo = 0;
                    hi = 0x7f;
                    isUnsigned = true;
                    break;
                case TYP_BYTE:
                    lo = -0x80;
                    hi = 0x7f;
                    break;
                case TYP_CHAR:
                    lo = 0;
                    hi = 0x7fff;
                    isUnsigned = true;
                    break;
                case TYP_SHORT:
                    lo = -0x8000;
                    hi = 0x7fff;
                    break;
                default:
                    unreached();
                }

                if ((ival >= lo) && (ival <= hi))
                {
                    // We can perform a small compare with the immediate 'ival'
                    tree->gtFlags |= GTF_RELOP_SMALL;
                    if (isUnsigned && !isEqualityCompare)
                    {
                        tree->gtFlags |= GTF_UNSIGNED;
                    }
                    // We can still treat a memory op1 as "contained".
                    op1CanBeContained = true;
                }
            }
        }

        if (op1CanBeContained)
        {
            if (op1->isMemoryOp())
            {
                MakeSrcContained(tree, op1);
            }
            else
            {
                // When op1 is a GT_AND we can often generate a single "test" instruction
                // instead of two instructions (an "and" instruction followed by a "cmp"/"test").
                //
                // This instruction can only be used for equality or inequality comparisons,
                // and we must have a compare against zero.
                //
                // If we have a positive test for a single bit we can reverse the condition and
                // make the compare be against zero.
                //
                // Example:
                //                  GT_EQ                               GT_NE
                //                  /   \                               /   \
                //             GT_AND   GT_CNS (0x100)  ==>>       GT_AND   GT_CNS (0)
                //             /    \                              /    \
                //        andOp1     GT_CNS (0x100)           andOp1     GT_CNS (0x100)
                //
                // We will mark the GT_AND node as contained if the tree is an equality compare with zero.
                // Additionally, when we do this we also allow for a contained memory operand for "andOp1".
                //
                bool isEqualityCompare = (tree->gtOper == GT_EQ || tree->gtOper == GT_NE);

                if (isEqualityCompare && (op1->OperGet() == GT_AND))
                {
                    GenTreePtr andOp2 = op1->gtOp.gtOp2;
                    if (IsContainableImmed(op1, andOp2))
                    {
                        ssize_t andOp2CnsVal = andOp2->AsIntConCommon()->IconValue();
                        ssize_t relOp2CnsVal = op2->AsIntConCommon()->IconValue();

                        if ((relOp2CnsVal == andOp2CnsVal) && isPow2(andOp2CnsVal))
                        {
                            // We have a single bit test, so now we can change the
                            // tree into the alternative form,
                            // so that we can generate a test instruction.

                            // Reverse the equality comparison
                            tree->gtOper = (tree->gtOper == GT_EQ) ? GT_NE : GT_EQ;

                            // Change the relOp2CnsVal to zero
                            relOp2CnsVal = 0;
                            op2->AsIntConCommon()->SetIconValue(0);
                        }

                        // Now do we have an equality compare with zero?
                        //
                        if (relOp2CnsVal == 0)
                        {
                            // Note that child nodes must be made contained before parent nodes

                            // Check for a memory operand for op1 with the test instruction
                            //
                            GenTreePtr andOp1 = op1->gtOp.gtOp1;
                            if (andOp1->isMemoryOp())
                            {
                                // Mark the 'andOp1' memory operand as contained.
                                // Note that for equality comparisons we don't need
                                // to deal with any signed or unsigned issues.
                                MakeSrcContained(op1, andOp1);
                            }
                            // Mark the 'op1' (the GT_AND) operand as contained
                            MakeSrcContained(tree, op1);

                            // During Codegen we will now generate "test andOp1, andOp2CnsVal"
                        }
                    }
                }
                else if (op1->OperGet() == GT_CAST)
                {
                    // If op1 is a cast operation, and the cast type is a one-byte-sized unsigned type,
                    // we can directly use the number in a register, instead of doing an extra cast step.
                    var_types  dstType       = op1->CastToType();
                    bool       isUnsignedDst = varTypeIsUnsigned(dstType);
                    emitAttr   castSize      = EA_ATTR(genTypeSize(dstType));
                    GenTreePtr castOp1       = op1->gtOp.gtOp1;
                    genTreeOps castOp1Oper   = castOp1->OperGet();
                    bool       safeOper      = false;

                    // It is not always safe to change the gtType of 'castOp1' to TYP_UBYTE.
                    // For example, when 'castOp1Oper' is a GT_RSZ or GT_RSH then we are shifting
                    // bits from the left into the lower bits. If we change the type to a TYP_UBYTE
                    // we will instead generate a byte sized shift operation: shr al, 24
                    // For the following ALU operations it is safe to change the gtType to the
                    // smaller type:
                    //
                    if ((castOp1Oper == GT_CNS_INT) ||
                        (castOp1Oper == GT_CALL)    ||    // the return value from a Call
                        (castOp1Oper == GT_LCL_VAR) ||
                        castOp1->OperIsLogical()    ||    // GT_AND, GT_OR, GT_XOR
                        castOp1->isMemoryOp()         )   // isIndir() || isLclField();
                    {
                        safeOper = true;
                    }

                    if ((castSize == EA_1BYTE) && isUnsignedDst &&    // Unsigned cast to TYP_UBYTE
                        safeOper &&                                   // Must be a safe operation
                        !op1->gtOverflow()                        )   // Must not be an overflow checking cast
                    {
                        // Currently all of the Opers accepted as 'safeOper' are
                        // non-overflow checking operations. If we were to add
                        // an overflow checking operation then this assert needs
                        // to be moved above to guard entry to this block.
                        //
                        assert(!castOp1->gtOverflowEx());             // Must not be an overflow checking operation

                        GenTreePtr removeTreeNode      = op1;
                        GenTreePtr removeTreeNodeChild = castOp1;
                        tree->gtOp.gtOp1 = castOp1;
                        castOp1->gtType  = TYP_UBYTE;

                        // Trim down the value if castOp1 is an int constant since its type changed to UBYTE.
                        if (castOp1Oper == GT_CNS_INT)
                        {
                            castOp1->gtIntCon.gtIconVal = (UINT8)castOp1->gtIntCon.gtIconVal;
                        }

                        if (op2->isContainedIntOrIImmed())
                        {
                            ssize_t val = (ssize_t)op2->AsIntConCommon()->IconValue();
                            if (val >= 0 && val <= 255)
                            {
                                op2->gtType = TYP_UBYTE;
                                tree->gtFlags |= GTF_UNSIGNED;

                                // Right now op1's type is the same as op2's type.
                                // If op1 is a memory op, we should make it a contained node.
                                if (castOp1->isMemoryOp())
                                {
                                    MakeSrcContained(tree, op1);
                                }
                            }
                        }
                        comp->fgSnipNode(comp->compCurStmt->AsStmt(), removeTreeNode);
#ifdef DEBUG
                        if (comp->verbose)
                        {
                            printf("LowerCmp: Removing a GT_CAST to TYP_UBYTE and changing castOp1->gtType to TYP_UBYTE\n");
                            comp->gtDispTree(tree);
                        }
#endif
                    }
                }
            }
        }
    }
    else if (op2->isMemoryOp())
    {
        if (op1Type == op2Type)
        {
            MakeSrcContained(tree, op2);

            // Mark the tree as doing unsigned comparison if
            // both the operands are small and unsigned types.
            // Otherwise we will end up performing a signed comparison
            // of two small unsigned values without zero extending them to
            // TYP_INT size, which is incorrect.
            if (varTypeIsSmall(op1Type) && varTypeIsUnsigned(op1Type))
            {
                tree->gtFlags |= GTF_UNSIGNED;
            }
        }
    }
    else if (op1->isMemoryOp())
    {
        if ((op1Type == op2Type) && IsSafeToContainMem(tree, op1))
        {
            MakeSrcContained(tree, op1);

            // Mark the tree as doing unsigned comparison if
            // both the operands are small and unsigned types.
            // Otherwise we will end up performing a signed comparison
            // of two small unsigned values without zero extending them to
            // TYP_INT size, which is incorrect.
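            // Concrete example (for illustration only): with two TYP_UBYTE operands holding 0xFF
            // and 0x01, a signed byte compare would conclude -1 < 1, whereas the correct unsigned
            // result is 255 > 1; the GTF_UNSIGNED flag set below makes codegen use the unsigned
            // condition codes (ja/jb instead of jg/jl).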
            if (varTypeIsSmall(op1Type) && varTypeIsUnsigned(op1Type))
            {
                tree->gtFlags |= GTF_UNSIGNED;
            }
        }
    }
}

/* Lower GT_CAST(srcType, dstType) nodes.
 *
 * Casts from small int type to float/double are transformed as follows:
 * GT_CAST(byte, float/double)     =   GT_CAST(GT_CAST(byte, int32), float/double)
 * GT_CAST(sbyte, float/double)    =   GT_CAST(GT_CAST(sbyte, int32), float/double)
 * GT_CAST(int16, float/double)    =   GT_CAST(GT_CAST(int16, int32), float/double)
 * GT_CAST(uint16, float/double)   =   GT_CAST(GT_CAST(uint16, int32), float/double)
 *
 * SSE2 conversion instructions operate on signed integers. Casts from Uint32/Uint64
 * are morphed as follows by the front-end and hence should not be seen here.
 * GT_CAST(uint32, float/double)   =   GT_CAST(GT_CAST(uint32, long), float/double)
 * GT_CAST(uint64, float)          =   GT_CAST(GT_CAST(uint64, double), float)
 *
 *
 * Similarly casts from float/double to a smaller int type are transformed as follows:
 * GT_CAST(float/double, byte)     =   GT_CAST(GT_CAST(float/double, int32), byte)
 * GT_CAST(float/double, sbyte)    =   GT_CAST(GT_CAST(float/double, int32), sbyte)
 * GT_CAST(float/double, int16)    =   GT_CAST(GT_CAST(float/double, int32), int16)
 * GT_CAST(float/double, uint16)   =   GT_CAST(GT_CAST(float/double, int32), uint16)
 *
 * SSE2 has instructions to convert a float/double value into a signed 32/64-bit
 * integer. The above transformations help us to leverage those instructions.
 *
 * Note that for the following conversions we still depend on helper calls and
 * don't expect to see them here.
 * i) GT_CAST(float/double, uint64)
 * ii) GT_CAST(float/double, int type with overflow detection)
 *
 * TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above.
 * There are hardly any occurrences of this conversion operation in platform
 * assemblies or in CQ perf benchmarks (1 occurrence in mscorlib, microsoft.jscript,
 * 1 occurrence in Roslyn, and no occurrences in system, system.core, system.numerics,
 * system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that
 * doing this optimization is a win, we should consider generating in-lined code.
 */
void Lowering::LowerCast(GenTreePtr* ppTree)
{
    GenTreePtr tree = *ppTree;
    assert(tree->OperGet() == GT_CAST);

    GenTreePtr op1     = tree->gtOp.gtOp1;
    var_types  dstType = tree->CastToType();
    var_types  srcType = op1->TypeGet();
    var_types  tmpType = TYP_UNDEF;
    bool       srcUns  = false;

    // force the srcType to unsigned if the GT_UNSIGNED flag is set
    if (tree->gtFlags & GTF_UNSIGNED)
    {
        srcType = genUnsignedType(srcType);
    }

    // We should never see the following casts as they are expected to be lowered
    // appropriately or converted into helper calls by the front-end.
    //   srcType = float/double    dstType = * and overflow detecting cast
    //       Reason: must be converted to a helper call
    //   srcType = float/double,   dstType = ulong
    //       Reason: must be converted to a helper call
    //   srcType = uint            dstType = float/double
    //       Reason: uint -> float/double = uint -> long -> float/double
    //   srcType = ulong           dstType = float
    //       Reason: ulong -> float = ulong -> double -> float
    if (varTypeIsFloating(srcType))
    {
        noway_assert(!tree->gtOverflow());
        noway_assert(dstType != TYP_ULONG);
    }
    else if (srcType == TYP_UINT)
    {
        noway_assert(!varTypeIsFloating(dstType));
    }
    else if (srcType == TYP_ULONG)
    {
        noway_assert(dstType != TYP_FLOAT);
    }

    // Case of src is a small type and dst is a floating point type.
    if (varTypeIsSmall(srcType) && varTypeIsFloating(dstType))
    {
        // These conversions can never be overflow detecting ones.
        noway_assert(!tree->gtOverflow());
        tmpType = TYP_INT;
    }
    // case of src is a floating point type and dst is a small type.
    else if (varTypeIsFloating(srcType) && varTypeIsSmall(dstType))
    {
        tmpType = TYP_INT;
    }

    if (tmpType != TYP_UNDEF)
    {
        GenTreePtr tmp = comp->gtNewCastNode(tmpType, op1, tmpType);
        tmp->gtFlags |= (tree->gtFlags & (GTF_UNSIGNED|GTF_OVERFLOW|GTF_EXCEPT));

        tree->gtFlags &= ~GTF_UNSIGNED;
        tree->gtOp.gtOp1 = tmp;
        op1->InsertAfterSelf(tmp);
    }
}

/** LowerStoreInd takes care of recognizing the cases where we have a tree node with the following
 * structure:
 *       storeInd(gtInd(subTreeA), binOp(gtInd(subTreeA), subTreeB)) or
 *       storeInd(gtInd(subTreeA), binOp(subTreeB, gtInd(subTreeA))) for the case of commutative
 *       operations.
 *
 * In x86/x64, for integer operations, this storeInd pattern can be effectively encoded in a
 * single instruction of the form:
 *          binOp [addressing mode], regSubTreeB
 * where regSubTreeB is the register where subTreeB was computed.
 *
 * If the recognition is successful, we mark all the nodes under the storeInd node as contained so codeGen
 * will generate the single instruction discussed above.
 *
 * Right now, we recognize a few cases:
 *     a) The gtIndir child is a lclVar
 *     b) A constant
 *     c) An lea.
 *     d) BinOp is either add, sub, xor, or, and, shl, rsh, rsz.
 *
 * TODO-CQ: Enable support for more complex indirections (if needed) or use the value numbering
 * package to perform more complex tree recognition.
 *
 * TODO-XArch-CQ: Add support for RMW of lcl fields (e.g. lclfield binop= source)
 *
 * Return value: In case we recognize the tree pattern, we return true to tell Lower that we're
 * finished and no further code needs to be run in order to lower this type of node.
 */
bool Lowering::LowerStoreInd(GenTreePtr tree)
{
    assert(tree->OperGet() == GT_STOREIND);

    // SSE2 doesn't support RMW operations on float/double types.
    assert(!varTypeIsFloating(tree));

    GenTreePtr indirDst = tree->gtGetOp1();
    GenTreePtr indirSrc = tree->gtGetOp2();

    const genTreeOps oper = indirSrc->OperGet();

    if (indirDst->OperGet() != GT_LEA &&
        indirDst->OperGet() != GT_LCL_VAR &&
        indirDst->OperGet() != GT_LCL_VAR_ADDR &&
        indirDst->OperGet() != GT_CLS_VAR_ADDR)
    {
        JITDUMP("Lower of StoreInd didn't mark the node as self contained\n");
        JITDUMP("because the type of indirection in the left hand side \n");
        JITDUMP("is not yet supported:\n");
        DISPTREE(indirDst);
        return false;
    }

    if (GenTree::OperIsBinary(oper))
    {
        if (indirSrc->gtOverflowEx())
        {
            // We cannot use Read-Modify-Write instruction forms with overflow checking instructions
            // because we are not allowed to modify the target until after the overflow check.
            //
            JITDUMP("Lower of StoreInd cannot lower overflow checking instructions into RMW forms\n");
            DISPTREE(indirDst);
            return false;
        }

        if (oper != GT_ADD &&
            oper != GT_SUB &&
            oper != GT_AND &&
            oper != GT_OR  &&
            oper != GT_XOR &&
            oper != GT_LSH &&
            oper != GT_RSH &&
            oper != GT_RSZ &&
            oper != GT_ROL &&
            oper != GT_ROR)
        {
            JITDUMP("Lower of StoreInd didn't mark the node as self contained\n");
            JITDUMP("because the node operator is not yet supported:\n");
            DISPTREE(indirSrc);
            return false;
        }

        if ((oper == GT_LSH ||
             oper == GT_RSH ||
             oper == GT_RSZ ||
             oper == GT_ROL ||
             oper == GT_ROR) && varTypeIsSmall(tree))
        {
            // In ldind, integer values smaller than 4 bytes, a boolean, or a character are
            // converted to 4 bytes by sign or zero-extension as appropriate.
            // If we directly shift the small-typed data in memory using sar/shr, we will lose
            // the sign or zero-extension bits and generate the wrong code.
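            // Worked example (for illustration only; not an observed dump): a TYP_CHAR location
            // holding 0x8000 is zero-extended to 0x00008000 before the shift, so "x >> 4" must
            // produce 0x0800; an RMW "sar word ptr [mem], 4" would instead treat the value as
            // negative and produce 0xF800, so small-typed shift/rotate RMW forms are rejected.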
            return false;
        }

        GenTreePtr rhsLeft  = indirSrc->gtGetOp1();
        GenTreePtr rhsRight = indirSrc->gtGetOp2();

        GenTreePtr indirCandidate = nullptr;
        GenTreePtr indirOpSource  = nullptr;

        if (rhsLeft->OperGet() == GT_IND &&
            rhsLeft->gtGetOp1()->OperGet() == indirDst->OperGet() &&
            IsSafeToContainMem(indirSrc, rhsLeft))
        {
            indirCandidate = rhsLeft;
            indirOpSource  = rhsRight;
        }
        else if (GenTree::OperIsCommutative(oper) &&
                 rhsRight->OperGet() == GT_IND &&
                 rhsRight->gtGetOp1()->OperGet() == indirDst->OperGet())
        {
            indirCandidate = rhsRight;
            indirOpSource  = rhsLeft;
        }

        if (indirCandidate == nullptr && indirOpSource == nullptr)
        {
            JITDUMP("Lower of StoreInd didn't mark the node as self contained\n");
            JITDUMP("because the indirections don't match or the operator is not commutative\n");
            DISPTREE(tree);
            return false;
        }

        if (IndirsAreEquivalent(indirCandidate, tree))
        {
            JITDUMP("Lower successfully detected an assignment of the form: *addrMode BinOp= source\n");
            tree->gtLsraInfo.srcCount = indirOpSource->gtLsraInfo.dstCount;
            SetStoreIndOpCounts(tree, indirCandidate);
            return true;
        }
        else
        {
            JITDUMP("Lower of StoreInd didn't mark the node as self contained\n");
            JITDUMP("because the indirections are not equivalent.\n");
            DISPTREE(tree);
            return false;
        }
    }
    else if (GenTree::OperIsUnary(oper))
    {
        // Nodes other than GT_NOT and GT_NEG are not yet supported
        // so we bail for now.
        if (oper != GT_NOT && oper != GT_NEG)
            return false;

        // If the operand of the GT_NOT | GT_NEG is not an indirection,
        // then this is not an RMW pattern.
        if (indirSrc->gtGetOp1()->OperGet() != GT_IND)
            return false;

        // We have a GT_IND below the NOT/NEG, so we attempt to recognize
        // the RMW pattern.
        GenTreePtr indirCandidate = indirSrc->gtGetOp1();
        if (IndirsAreEquivalent(indirCandidate, tree))
        {
            JITDUMP("Lower successfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n");
            tree->gtLsraInfo.srcCount = 0;
            SetStoreIndOpCounts(tree, indirCandidate);
            return true;
        }
        else
        {
            JITDUMP("Lower of StoreInd didn't mark the node as self contained\n");
            JITDUMP("because the indirections are not equivalent.\n");
            DISPTREE(tree);
            return false;
        }
    }
    else
    {
        JITDUMP("Lower of StoreInd didn't mark the node as self contained\n");
        JITDUMP("because the operator on the right hand side of the indirection is not\n");
        JITDUMP("a binary or unary operator.\n");
        DISPTREE(tree);
        return false;
    }
}

void Lowering::LowerRotate(GenTreePtr tree)
{
    // xarch supports both ROL and ROR instructions so no lowering is required.
}

void Lowering::SetStoreIndOpCounts(GenTreePtr storeInd, GenTreePtr indirCandidate)
{
    GenTreePtr indirDst = storeInd->gtGetOp1();
    GenTreePtr indirSrc = storeInd->gtGetOp2();
    TreeNodeInfo* info = &(storeInd->gtLsraInfo);

    info->dstCount = 0;
    m_lsra->clearOperandCounts(indirSrc);
    m_lsra->clearOperandCounts(indirCandidate);

    GenTreePtr indirCandidateChild = indirCandidate->gtGetOp1();
    if (indirCandidateChild->OperGet() == GT_LEA)
    {
        GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode();

        if (addrMode->HasBase())
        {
            assert(addrMode->Base()->OperIsLeaf());
            m_lsra->clearOperandCounts(addrMode->Base());
            info->srcCount++;
        }

        if (addrMode->HasIndex())
        {
            assert(addrMode->Index()->OperIsLeaf());
            m_lsra->clearOperandCounts(addrMode->Index());
            info->srcCount++;
        }

        m_lsra->clearOperandCounts(indirDst);
    }
    else
    {
        assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR);
        info->srcCount += indirCandidateChild->gtLsraInfo.dstCount;

        // If it is a GT_LCL_VAR, it still needs the reg to hold the address.
// However for GT_CLS_VAR_ADDR, we don't need that reg to hold the address, because field address value is known at this time. if(indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR) { m_lsra->clearOperandCounts(indirDst); } } m_lsra->clearOperandCounts(indirCandidateChild); } /** * Takes care of annotating the src and dst register * requirements for a GT_MUL treenode. */ void Lowering::SetMulOpCounts(GenTreePtr tree) { assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI); TreeNodeInfo* info = &(tree->gtLsraInfo); info->srcCount = 2; info->dstCount = 1; GenTreePtr op1 = tree->gtOp.gtOp1; GenTreePtr op2 = tree->gtOp.gtOp2; // Case of float/double mul. if (varTypeIsFloating(tree->TypeGet())) { if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl()) { MakeSrcContained(tree, op2); } return; } bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0); bool requiresOverflowCheck = tree->gtOverflowEx(); bool useLeaEncoding = false; GenTreePtr memOp = nullptr; // There are three forms of x86 multiply: // one-op form: RDX:RAX = RAX * r/m // two-op form: reg *= r/m // three-op form: reg = r/m * imm // This special widening 32x32->64 MUL is not used on x64 assert((tree->gtFlags & GTF_MUL_64RSLT) == 0); // Multiply should never be using small types assert(!varTypeIsSmall(tree->TypeGet())); // We do use the widening multiply to implement // the overflow checking for unsigned multiply // if (isUnsignedMultiply && requiresOverflowCheck) { // The only encoding provided is RDX:RAX = RAX * rm // // Here we set RAX as the only destination candidate // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX // info->setDstCandidates(m_lsra,RBM_RAX); } else if (tree->gtOper == GT_MULHI) { // have to use the encoding:RDX:RAX = RAX * rm info->setDstCandidates(m_lsra, RBM_RAX); } else if (IsContainableImmed(tree, op2) || IsContainableImmed(tree, op1)) { GenTreeIntConCommon* imm; GenTreePtr other; if (IsContainableImmed(tree, op2)) { imm = op2->AsIntConCommon(); other = op1; } else { imm = op1->AsIntConCommon(); other = op2; } // CQ: We want to rewrite this into a LEA ssize_t immVal = imm->AsIntConCommon()->IconValue(); if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9)) { useLeaEncoding = true; } MakeSrcContained(tree, imm); // The imm is always contained if (other->isIndir()) { memOp = other; // memOp may be contained below } } // We allow one operand to be a contained memory operand. // The memory op type must match with the 'tree' type. // This is because during codegen we use 'tree' type to derive EmitTypeSize. // E.g op1 type = byte, op2 type = byte but GT_MUL tree type is int. // if (memOp == nullptr && op2->isMemoryOp()) { memOp = op2; } // To generate an LEA we need to force memOp into a register // so don't allow memOp to be 'contained' // if ((memOp != nullptr) && !useLeaEncoding && (memOp->TypeGet() == tree->TypeGet()) && IsSafeToContainMem(tree, memOp)) { MakeSrcContained(tree, memOp); } } //------------------------------------------------------------------------------ // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format // // Arguments: // tree - a binary tree node // // Return Value: // Returns true if we can use the read-modify-write instruction form // // Notes: // This is used to determine whether to preference the source to the destination register. // bool Lowering::isRMWRegOper(GenTreePtr tree) { // TODO-XArch-CQ: Make this more accurate. // For now, We assume that most binary operators are of the RMW form. 
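    // Hedged illustration (a sketch, not produced by this method): "a = a + b" maps onto the
    // two-operand RMW encoding "add reg_a, reg_b", so preferencing a's source register to the
    // destination saves a copy, whereas GT_LEA has a true three-operand form such as
    // "lea r, [rb + ri*4]" and is excluded from RMW treatment below.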
    assert(tree->OperIsBinary());

    if (tree->OperIsCompare())
    {
        return false;
    }

    // These Opers either support a three-op form (e.g. GT_LEA), or do not read/write their first operand
    if ((tree->OperGet() == GT_LEA) || (tree->OperGet() == GT_STOREIND) || (tree->OperGet() == GT_ARR_INDEX))
        return false;

    // x86/x64 does support a three-op multiply when op1 or op2 is a contained immediate
    if ((tree->OperGet() == GT_MUL) &&
        (Lowering::IsContainableImmed(tree, tree->gtOp.gtOp2) ||
         Lowering::IsContainableImmed(tree, tree->gtOp.gtOp1)))
    {
        return false;
    }

    // Otherwise we return true.
    return true;
}

// Anything is in range for AMD64.
bool Lowering::IsCallTargetInRange(void* addr)
{
    return true;
}

// Return true if the immediate can be folded into an instruction,
// for example when it is small enough and non-relocatable.
bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode)
{
    if (!childNode->IsIntCnsFitsInI32())
        return false;

    if (childNode->IsIconHandle() && comp->opts.compReloc)
        return false;

    return true;
}

#endif // _TARGET_XARCH_
#endif // !LEGACY_BACKEND