Diffstat (limited to 'src/jit/codegenxarch.cpp')
-rw-r--r-- | src/jit/codegenxarch.cpp | 9388
1 file changed, 9388 insertions, 0 deletions
diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp new file mode 100644 index 0000000000..a41c28695b --- /dev/null +++ b/src/jit/codegenxarch.cpp @@ -0,0 +1,9388 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XX XX +XX Amd64/x86 Code Generator XX +XX XX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +*/ +#include "jitpch.h" +#ifdef _MSC_VER +#pragma hdrstop +#endif + +#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator. + +#ifdef _TARGET_XARCH_ +#include "emit.h" +#include "codegen.h" +#include "lower.h" +#include "gcinfo.h" +#include "gcinfoencoder.h" + +// Get the register assigned to the given node + +regNumber CodeGenInterface::genGetAssignedReg(GenTreePtr tree) +{ + return tree->gtRegNum; +} + +//------------------------------------------------------------------------ +// genSpillVar: Spill a local variable +// +// Arguments: +// tree - the lclVar node for the variable being spilled +// +// Return Value: +// None. +// +// Assumptions: +// The lclVar must be a register candidate (lvRegCandidate) + +void CodeGen::genSpillVar(GenTreePtr tree) +{ + unsigned varNum = tree->gtLclVarCommon.gtLclNum; + LclVarDsc* varDsc = &(compiler->lvaTable[varNum]); + + assert(varDsc->lvIsRegCandidate()); + + // We don't actually need to spill if it is already living in memory + bool needsSpill = ((tree->gtFlags & GTF_VAR_DEF) == 0 && varDsc->lvIsInReg()); + if (needsSpill) + { + var_types lclTyp = varDsc->TypeGet(); + if (varDsc->lvNormalizeOnStore()) + { + lclTyp = genActualType(lclTyp); + } + emitAttr size = emitTypeSize(lclTyp); + + bool restoreRegVar = false; + if (tree->gtOper == GT_REG_VAR) + { + tree->SetOper(GT_LCL_VAR); + restoreRegVar = true; + } + + // mask off the flag to generate the right spill code, then bring it back + tree->gtFlags &= ~GTF_REG_VAL; + + instruction storeIns = ins_Store(tree->TypeGet(), compiler->isSIMDTypeLocalAligned(varNum)); +#if CPU_LONG_USES_REGPAIR + if (varTypeIsMultiReg(tree)) + { + assert(varDsc->lvRegNum == genRegPairLo(tree->gtRegPair)); + assert(varDsc->lvOtherReg == genRegPairHi(tree->gtRegPair)); + regNumber regLo = genRegPairLo(tree->gtRegPair); + regNumber regHi = genRegPairHi(tree->gtRegPair); + inst_TT_RV(storeIns, tree, regLo); + inst_TT_RV(storeIns, tree, regHi, 4); + } + else +#endif + { + assert(varDsc->lvRegNum == tree->gtRegNum); + inst_TT_RV(storeIns, tree, tree->gtRegNum, 0, size); + } + tree->gtFlags |= GTF_REG_VAL; + + if (restoreRegVar) + { + tree->SetOper(GT_REG_VAR); + } + + genUpdateRegLife(varDsc, /*isBorn*/ false, /*isDying*/ true DEBUGARG(tree)); + gcInfo.gcMarkRegSetNpt(varDsc->lvRegMask()); + + if (VarSetOps::IsMember(compiler, gcInfo.gcTrkStkPtrLcls, varDsc->lvVarIndex)) + { +#ifdef DEBUG + if (!VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex)) + { + JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming live\n", varNum); + } + else + { + JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing live\n", varNum); + } +#endif + VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); + } + } + + tree->gtFlags &= 
~GTF_SPILL; + varDsc->lvRegNum = REG_STK; + if (varTypeIsMultiReg(tree)) + { + varDsc->lvOtherReg = REG_STK; + } +} + +// inline +void CodeGenInterface::genUpdateVarReg(LclVarDsc* varDsc, GenTreePtr tree) +{ + assert(tree->OperIsScalarLocal() || (tree->gtOper == GT_COPY)); + varDsc->lvRegNum = tree->gtRegNum; +} + +/*****************************************************************************/ +/*****************************************************************************/ + +/***************************************************************************** + * + * Generate code that will set the given register to the integer constant. + */ + +void CodeGen::genSetRegToIcon(regNumber reg, ssize_t val, var_types type, insFlags flags) +{ + // Reg cannot be a FP reg + assert(!genIsValidFloatReg(reg)); + + // The only TYP_REF constant that can come this path is a managed 'null' since it is not + // relocatable. Other ref type constants (e.g. string objects) go through a different + // code path. + noway_assert(type != TYP_REF || val == 0); + + if (val == 0) + { + instGen_Set_Reg_To_Zero(emitActualTypeSize(type), reg, flags); + } + else + { + // TODO-XArch-CQ: needs all the optimized cases + getEmitter()->emitIns_R_I(INS_mov, emitActualTypeSize(type), reg, val); + } +} + +/***************************************************************************** + * + * Generate code to check that the GS cookie wasn't thrashed by a buffer + * overrun. If pushReg is true, preserve all registers around code sequence. + * Otherwise ECX could be modified. + * + * Implementation Note: pushReg = true, in case of tail calls. + */ +void CodeGen::genEmitGSCookieCheck(bool pushReg) +{ + noway_assert(compiler->gsGlobalSecurityCookieAddr || compiler->gsGlobalSecurityCookieVal); + + // Make sure that EAX is reported as live GC-ref so that any GC that kicks in while + // executing GS cookie check will not collect the object pointed to by EAX. + // + // For Amd64 System V, a two-register-returned struct could be returned in RAX and RDX + // In such case make sure that the correct GC-ness of RDX is reported as well, so + // a GC object pointed by RDX will not be collected. + if (!pushReg) + { + // Handle multi-reg return type values + if (compiler->compMethodReturnsMultiRegRetType()) + { + ReturnTypeDesc retTypeDesc; + if (varTypeIsLong(compiler->info.compRetNativeType)) + { + retTypeDesc.InitializeLongReturnType(compiler); + } + else // we must have a struct return type + { + retTypeDesc.InitializeStructReturnType(compiler, compiler->info.compMethodInfo->args.retTypeClass); + } + + unsigned regCount = retTypeDesc.GetReturnRegCount(); + + // Only x86 and x64 Unix ABI allows multi-reg return and + // number of result regs should be equal to MAX_RET_REG_COUNT. + assert(regCount == MAX_RET_REG_COUNT); + + for (unsigned i = 0; i < regCount; ++i) + { + gcInfo.gcMarkRegPtrVal(retTypeDesc.GetABIReturnReg(i), retTypeDesc.GetReturnRegType(i)); + } + } + else if (compiler->compMethodReturnsRetBufAddr()) + { + // This is for returning in an implicit RetBuf. + // If the address of the buffer is returned in REG_INTRET, mark the content of INTRET as ByRef. + + // In case the return is in an implicit RetBuf, the native return type should be a struct + assert(varTypeIsStruct(compiler->info.compRetNativeType)); + + gcInfo.gcMarkRegPtrVal(REG_INTRET, TYP_BYREF); + } + // ... all other cases. + else + { +#ifdef _TARGET_AMD64_ + // For x64, structs that are not returned in registers are always + // returned in implicit RetBuf. 
If we reached here, we should not have + // a RetBuf and the return type should not be a struct. + assert(compiler->info.compRetBuffArg == BAD_VAR_NUM); + assert(!varTypeIsStruct(compiler->info.compRetNativeType)); +#endif // _TARGET_AMD64_ + + // For x86 Windows we can't make such assertions since we generate code for returning of + // the RetBuf in REG_INTRET only when the ProfilerHook is enabled. Otherwise + // compRetNativeType could be TYP_STRUCT. + gcInfo.gcMarkRegPtrVal(REG_INTRET, compiler->info.compRetNativeType); + } + } + + regNumber regGSCheck; + if (!pushReg) + { + // Non-tail call: we can use any callee trash register that is not + // a return register or contain 'this' pointer (keep alive this), since + // we are generating GS cookie check after a GT_RETURN block. + // Note: On Amd64 System V RDX is an arg register - REG_ARG_2 - as well + // as return register for two-register-returned structs. + if (compiler->lvaKeepAliveAndReportThis() && compiler->lvaTable[compiler->info.compThisArg].lvRegister && + (compiler->lvaTable[compiler->info.compThisArg].lvRegNum == REG_ARG_0)) + { + regGSCheck = REG_ARG_1; + } + else + { + regGSCheck = REG_ARG_0; + } + } + else + { +#ifdef _TARGET_X86_ + NYI_X86("Tail calls from methods that need GS check"); + regGSCheck = REG_NA; +#else // !_TARGET_X86_ + // Tail calls from methods that need GS check: We need to preserve registers while + // emitting GS cookie check for a tail prefixed call or a jmp. To emit GS cookie + // check, we might need a register. This won't be an issue for jmp calls for the + // reason mentioned below (see comment starting with "Jmp Calls:"). + // + // The following are the possible solutions in case of tail prefixed calls: + // 1) Use R11 - ignore tail prefix on calls that need to pass a param in R11 when + // present in methods that require GS cookie check. Rest of the tail calls that + // do not require R11 will be honored. + // 2) Internal register - GT_CALL node reserves an internal register and emits GS + // cookie check as part of tail call codegen. GenExitCode() needs to special case + // fast tail calls implemented as epilog+jmp or such tail calls should always get + // dispatched via helper. + // 3) Materialize GS cookie check as a sperate node hanging off GT_CALL node in + // right execution order during rationalization. + // + // There are two calls that use R11: VSD and calli pinvokes with cookie param. Tail + // prefix on pinvokes is ignored. That is, options 2 and 3 will allow tail prefixed + // VSD calls from methods that need GS check. + // + // Tail prefixed calls: Right now for Jit64 compat, method requiring GS cookie check + // ignores tail prefix. In future, if we intend to support tail calls from such a method, + // consider one of the options mentioned above. For now adding an assert that we don't + // expect to see a tail call in a method that requires GS check. + noway_assert(!compiler->compTailCallUsed); + + // Jmp calls: specify method handle using which JIT queries VM for its entry point + // address and hence it can neither be a VSD call nor PInvoke calli with cookie + // parameter. Therefore, in case of jmp calls it is safe to use R11. + regGSCheck = REG_R11; +#endif // !_TARGET_X86_ + } + + if (compiler->gsGlobalSecurityCookieAddr == nullptr) + { + // If GS cookie value fits within 32-bits we can use 'cmp mem64, imm32'. + // Otherwise, load the value into a reg and use 'cmp mem64, reg64'. 
+ if ((int)compiler->gsGlobalSecurityCookieVal != (ssize_t)compiler->gsGlobalSecurityCookieVal) + { + genSetRegToIcon(regGSCheck, compiler->gsGlobalSecurityCookieVal, TYP_I_IMPL); + getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0); + } + else + { + getEmitter()->emitIns_S_I(INS_cmp, EA_PTRSIZE, compiler->lvaGSSecurityCookie, 0, + (int)compiler->gsGlobalSecurityCookieVal); + } + } + else + { + // Ngen case - GS cookie value needs to be accessed through an indirection. + instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, regGSCheck, (ssize_t)compiler->gsGlobalSecurityCookieAddr); + getEmitter()->emitIns_R_AR(ins_Load(TYP_I_IMPL), EA_PTRSIZE, regGSCheck, regGSCheck, 0); + getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0); + } + + BasicBlock* gsCheckBlk = genCreateTempLabel(); + emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED); + inst_JMP(jmpEqual, gsCheckBlk); + genEmitHelperCall(CORINFO_HELP_FAIL_FAST, 0, EA_UNKNOWN); + genDefineTempLabel(gsCheckBlk); +} + +/***************************************************************************** + * + * Generate code for all the basic blocks in the function. + */ + +void CodeGen::genCodeForBBlist() +{ + unsigned varNum; + LclVarDsc* varDsc; + + unsigned savedStkLvl; + +#ifdef DEBUG + genInterruptibleUsed = true; + + // You have to be careful if you create basic blocks from now on + compiler->fgSafeBasicBlockCreation = false; + + // This stress mode is not comptible with fully interruptible GC + if (genInterruptible && compiler->opts.compStackCheckOnCall) + { + compiler->opts.compStackCheckOnCall = false; + } + + // This stress mode is not comptible with fully interruptible GC + if (genInterruptible && compiler->opts.compStackCheckOnRet) + { + compiler->opts.compStackCheckOnRet = false; + } +#endif // DEBUG + + // Prepare the blocks for exception handling codegen: mark the blocks that needs labels. + genPrepForEHCodegen(); + + assert(!compiler->fgFirstBBScratch || + compiler->fgFirstBB == compiler->fgFirstBBScratch); // compiler->fgFirstBBScratch has to be first. + + /* Initialize the spill tracking logic */ + + regSet.rsSpillBeg(); + +#ifdef DEBUGGING_SUPPORT + /* Initialize the line# tracking logic */ + + if (compiler->opts.compScopeInfo) + { + siInit(); + } +#endif + + // The current implementation of switch tables requires the first block to have a label so it + // can generate offsets to the switch label targets. + // TODO-XArch-CQ: remove this when switches have been re-implemented to not use this. + if (compiler->fgHasSwitch) + { + compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET; + } + + genPendingCallLabel = nullptr; + + /* Initialize the pointer tracking code */ + + gcInfo.gcRegPtrSetInit(); + gcInfo.gcVarPtrSetInit(); + + /* If any arguments live in registers, mark those regs as such */ + + for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->lvaCount; varNum++, varDsc++) + { + /* Is this variable a parameter assigned to a register? */ + + if (!varDsc->lvIsParam || !varDsc->lvRegister) + { + continue; + } + + /* Is the argument live on entry to the method? */ + + if (!VarSetOps::IsMember(compiler, compiler->fgFirstBB->bbLiveIn, varDsc->lvVarIndex)) + { + continue; + } + + /* Is this a floating-point argument? 
*/ + + if (varDsc->IsFloatRegType()) + { + continue; + } + + noway_assert(!varTypeIsFloating(varDsc->TypeGet())); + + /* Mark the register as holding the variable */ + + regTracker.rsTrackRegLclVar(varDsc->lvRegNum, varNum); + } + + unsigned finallyNesting = 0; + + // Make sure a set is allocated for compiler->compCurLife (in the long case), so we can set it to empty without + // allocation at the start of each basic block. + VarSetOps::AssignNoCopy(compiler, compiler->compCurLife, VarSetOps::MakeEmpty(compiler)); + + /*------------------------------------------------------------------------- + * + * Walk the basic blocks and generate code for each one + * + */ + + BasicBlock* block; + BasicBlock* lblk; /* previous block */ + + for (lblk = nullptr, block = compiler->fgFirstBB; block != nullptr; lblk = block, block = block->bbNext) + { +#ifdef DEBUG + if (compiler->verbose) + { + printf("\n=============== Generating "); + block->dspBlockHeader(compiler, true, true); + compiler->fgDispBBLiveness(block); + } +#endif // DEBUG + + // Figure out which registers hold variables on entry to this block + + regSet.ClearMaskVars(); + gcInfo.gcRegGCrefSetCur = RBM_NONE; + gcInfo.gcRegByrefSetCur = RBM_NONE; + + compiler->m_pLinearScan->recordVarLocationsAtStartOfBB(block); + + genUpdateLife(block->bbLiveIn); + + // Even if liveness didn't change, we need to update the registers containing GC references. + // genUpdateLife will update the registers live due to liveness changes. But what about registers that didn't + // change? We cleared them out above. Maybe we should just not clear them out, but update the ones that change + // here. That would require handling the changes in recordVarLocationsAtStartOfBB(). + + regMaskTP newLiveRegSet = RBM_NONE; + regMaskTP newRegGCrefSet = RBM_NONE; + regMaskTP newRegByrefSet = RBM_NONE; +#ifdef DEBUG + VARSET_TP VARSET_INIT_NOCOPY(removedGCVars, VarSetOps::MakeEmpty(compiler)); + VARSET_TP VARSET_INIT_NOCOPY(addedGCVars, VarSetOps::MakeEmpty(compiler)); +#endif + VARSET_ITER_INIT(compiler, iter, block->bbLiveIn, varIndex); + while (iter.NextElem(compiler, &varIndex)) + { + unsigned varNum = compiler->lvaTrackedToVarNum[varIndex]; + LclVarDsc* varDsc = &(compiler->lvaTable[varNum]); + + if (varDsc->lvIsInReg()) + { + newLiveRegSet |= varDsc->lvRegMask(); + if (varDsc->lvType == TYP_REF) + { + newRegGCrefSet |= varDsc->lvRegMask(); + } + else if (varDsc->lvType == TYP_BYREF) + { + newRegByrefSet |= varDsc->lvRegMask(); + } +#ifdef DEBUG + if (verbose && VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varIndex)) + { + VarSetOps::AddElemD(compiler, removedGCVars, varIndex); + } +#endif // DEBUG + VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varIndex); + } + else if (compiler->lvaIsGCTracked(varDsc)) + { +#ifdef DEBUG + if (verbose && !VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varIndex)) + { + VarSetOps::AddElemD(compiler, addedGCVars, varIndex); + } +#endif // DEBUG + VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varIndex); + } + } + + regSet.rsMaskVars = newLiveRegSet; + +#ifdef DEBUG + if (compiler->verbose) + { + if (!VarSetOps::IsEmpty(compiler, addedGCVars)) + { + printf("\t\t\t\t\t\t\tAdded GCVars: "); + dumpConvertedVarSet(compiler, addedGCVars); + printf("\n"); + } + if (!VarSetOps::IsEmpty(compiler, removedGCVars)) + { + printf("\t\t\t\t\t\t\tRemoved GCVars: "); + dumpConvertedVarSet(compiler, removedGCVars); + printf("\n"); + } + } +#endif // DEBUG + + gcInfo.gcMarkRegSetGCref(newRegGCrefSet DEBUGARG(true)); + 
gcInfo.gcMarkRegSetByref(newRegByrefSet DEBUGARG(true)); + + /* Blocks with handlerGetsXcptnObj()==true use GT_CATCH_ARG to + represent the exception object (TYP_REF). + We mark REG_EXCEPTION_OBJECT as holding a GC object on entry + to the block, it will be the first thing evaluated + (thanks to GTF_ORDER_SIDEEFF). + */ + + if (handlerGetsXcptnObj(block->bbCatchTyp)) + { + for (GenTree* node : LIR::AsRange(block)) + { + if (node->OperGet() == GT_CATCH_ARG) + { + gcInfo.gcMarkRegSetGCref(RBM_EXCEPTION_OBJECT); + break; + } + } + } + + /* Start a new code output block */ + + genUpdateCurrentFunclet(block); + + if (genAlignLoops && block->bbFlags & BBF_LOOP_HEAD) + { + getEmitter()->emitLoopAlign(); + } + +#ifdef DEBUG + if (compiler->opts.dspCode) + { + printf("\n L_M%03u_BB%02u:\n", Compiler::s_compMethodsCount, block->bbNum); + } +#endif + + block->bbEmitCookie = nullptr; + + if (block->bbFlags & (BBF_JMP_TARGET | BBF_HAS_LABEL)) + { + /* Mark a label and update the current set of live GC refs */ + + block->bbEmitCookie = getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, + gcInfo.gcRegByrefSetCur, FALSE); + } + + if (block == compiler->fgFirstColdBlock) + { +#ifdef DEBUG + if (compiler->verbose) + { + printf("\nThis is the start of the cold region of the method\n"); + } +#endif + // We should never have a block that falls through into the Cold section + noway_assert(!lblk->bbFallsThrough()); + + // We require the block that starts the Cold section to have a label + noway_assert(block->bbEmitCookie); + getEmitter()->emitSetFirstColdIGCookie(block->bbEmitCookie); + } + + /* Both stacks are always empty on entry to a basic block */ + + genStackLevel = 0; + + savedStkLvl = genStackLevel; + + /* Tell everyone which basic block we're working on */ + + compiler->compCurBB = block; + +#ifdef DEBUGGING_SUPPORT + siBeginBlock(block); + + // BBF_INTERNAL blocks don't correspond to any single IL instruction. + if (compiler->opts.compDbgInfo && (block->bbFlags & BBF_INTERNAL) && + !compiler->fgBBisScratch(block)) // If the block is the distinguished first scratch block, then no need to + // emit a NO_MAPPING entry, immediately after the prolog. + { + genIPmappingAdd((IL_OFFSETX)ICorDebugInfo::NO_MAPPING, true); + } + + bool firstMapping = true; +#endif // DEBUGGING_SUPPORT + + /*--------------------------------------------------------------------- + * + * Generate code for each statement-tree in the block + * + */ + CLANG_FORMAT_COMMENT_ANCHOR; + +#if FEATURE_EH_FUNCLETS + if (block->bbFlags & BBF_FUNCLET_BEG) + { + genReserveFuncletProlog(block); + } +#endif // FEATURE_EH_FUNCLETS + + // Clear compCurStmt and compCurLifeTree. + compiler->compCurStmt = nullptr; + compiler->compCurLifeTree = nullptr; + + // Traverse the block in linear order, generating code for each node as we + // as we encounter it. + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef DEBUGGING_SUPPORT + IL_OFFSETX currentILOffset = BAD_IL_OFFSET; +#endif + for (GenTree* node : LIR::AsRange(block).NonPhiNodes()) + { +#ifdef DEBUGGING_SUPPORT + // Do we have a new IL offset? 
+ if (node->OperGet() == GT_IL_OFFSET) + { + genEnsureCodeEmitted(currentILOffset); + currentILOffset = node->gtStmt.gtStmtILoffsx; + genIPmappingAdd(currentILOffset, firstMapping); + firstMapping = false; + } +#endif // DEBUGGING_SUPPORT + +#ifdef DEBUG + if (node->OperGet() == GT_IL_OFFSET) + { + noway_assert(node->gtStmt.gtStmtLastILoffs <= compiler->info.compILCodeSize || + node->gtStmt.gtStmtLastILoffs == BAD_IL_OFFSET); + + if (compiler->opts.dspCode && compiler->opts.dspInstrs && + node->gtStmt.gtStmtLastILoffs != BAD_IL_OFFSET) + { + while (genCurDispOffset <= node->gtStmt.gtStmtLastILoffs) + { + genCurDispOffset += dumpSingleInstr(compiler->info.compCode, genCurDispOffset, "> "); + } + } + } +#endif // DEBUG + + genCodeForTreeNode(node); + if (node->gtHasReg() && node->gtLsraInfo.isLocalDefUse) + { + genConsumeReg(node); + } + } // end for each node in block + +#ifdef DEBUG + // The following set of register spill checks and GC pointer tracking checks used to be + // performed at statement boundaries. Now, with LIR, there are no statements, so they are + // performed at the end of each block. + // TODO: could these checks be performed more frequently? E.g., at each location where + // the register allocator says there are no live non-variable registers. Perhaps this could + // be done by (a) keeping a running count of live non-variable registers by using + // gtLsraInfo.srcCount and gtLsraInfo.dstCount to decrement and increment the count, respectively, + // and running the checks when the count is zero. Or, (b) use the map maintained by LSRA + // (operandToLocationInfoMap) to mark a node somehow when, after the execution of that node, + // there will be no live non-variable registers. + + regSet.rsSpillChk(); + + /* Make sure we didn't bungle pointer register tracking */ + + regMaskTP ptrRegs = gcInfo.gcRegGCrefSetCur | gcInfo.gcRegByrefSetCur; + regMaskTP nonVarPtrRegs = ptrRegs & ~regSet.rsMaskVars; + + // If return is a GC-type, clear it. Note that if a common + // epilog is generated (genReturnBB) it has a void return + // even though we might return a ref. 
We can't use the compRetType + // as the determiner because something we are tracking as a byref + // might be used as a return value of a int function (which is legal) + GenTree* blockLastNode = block->lastNode(); + if ((blockLastNode != nullptr) && (blockLastNode->gtOper == GT_RETURN) && + (varTypeIsGC(compiler->info.compRetType) || + (blockLastNode->gtOp.gtOp1 != nullptr && varTypeIsGC(blockLastNode->gtOp.gtOp1->TypeGet())))) + { + nonVarPtrRegs &= ~RBM_INTRET; + } + + if (nonVarPtrRegs) + { + printf("Regset after BB%02u gcr=", block->bbNum); + printRegMaskInt(gcInfo.gcRegGCrefSetCur & ~regSet.rsMaskVars); + compiler->getEmitter()->emitDispRegSet(gcInfo.gcRegGCrefSetCur & ~regSet.rsMaskVars); + printf(", byr="); + printRegMaskInt(gcInfo.gcRegByrefSetCur & ~regSet.rsMaskVars); + compiler->getEmitter()->emitDispRegSet(gcInfo.gcRegByrefSetCur & ~regSet.rsMaskVars); + printf(", regVars="); + printRegMaskInt(regSet.rsMaskVars); + compiler->getEmitter()->emitDispRegSet(regSet.rsMaskVars); + printf("\n"); + } + + noway_assert(nonVarPtrRegs == RBM_NONE); +#endif // DEBUG + +#if defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_AMD64_) + if (block->bbNext == nullptr) + { + // Unit testing of the AMD64 emitter: generate a bunch of instructions into the last block + // (it's as good as any, but better than the prolog, which can only be a single instruction + // group) then use COMPlus_JitLateDisasm=* to see if the late disassembler + // thinks the instructions are the same as we do. + genAmd64EmitterUnitTests(); + } +#endif // defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_ARM64_) + +#ifdef DEBUGGING_SUPPORT + // It is possible to reach the end of the block without generating code for the current IL offset. + // For example, if the following IR ends the current block, no code will have been generated for + // offset 21: + // + // ( 0, 0) [000040] ------------ il_offset void IL offset: 21 + // + // N001 ( 0, 0) [000039] ------------ nop void + // + // This can lead to problems when debugging the generated code. To prevent these issues, make sure + // we've generated code for the last IL offset we saw in the block. + genEnsureCodeEmitted(currentILOffset); + + if (compiler->opts.compScopeInfo && (compiler->info.compVarScopesCount > 0)) + { + siEndBlock(block); + + /* Is this the last block, and are there any open scopes left ? */ + + bool isLastBlockProcessed = (block->bbNext == nullptr); + if (block->isBBCallAlwaysPair()) + { + isLastBlockProcessed = (block->bbNext->bbNext == nullptr); + } + + if (isLastBlockProcessed && siOpenScopeList.scNext) + { + /* This assert no longer holds, because we may insert a throw + block to demarcate the end of a try or finally region when they + are at the end of the method. It would be nice if we could fix + our code so that this throw block will no longer be necessary. 
*/ + + // noway_assert(block->bbCodeOffsEnd != compiler->info.compILCodeSize); + + siCloseAllOpenScopes(); + } + } + +#endif // DEBUGGING_SUPPORT + + genStackLevel -= savedStkLvl; + +#ifdef DEBUG + // compCurLife should be equal to the liveOut set, except that we don't keep + // it up to date for vars that are not register candidates + // (it would be nice to have a xor set function) + + VARSET_TP VARSET_INIT_NOCOPY(extraLiveVars, VarSetOps::Diff(compiler, block->bbLiveOut, compiler->compCurLife)); + VarSetOps::UnionD(compiler, extraLiveVars, VarSetOps::Diff(compiler, compiler->compCurLife, block->bbLiveOut)); + VARSET_ITER_INIT(compiler, extraLiveVarIter, extraLiveVars, extraLiveVarIndex); + while (extraLiveVarIter.NextElem(compiler, &extraLiveVarIndex)) + { + unsigned varNum = compiler->lvaTrackedToVarNum[extraLiveVarIndex]; + LclVarDsc* varDsc = compiler->lvaTable + varNum; + assert(!varDsc->lvIsRegCandidate()); + } +#endif + + /* Both stacks should always be empty on exit from a basic block */ + noway_assert(genStackLevel == 0); + +#ifdef _TARGET_AMD64_ + // On AMD64, we need to generate a NOP after a call that is the last instruction of the block, in several + // situations, to support proper exception handling semantics. This is mostly to ensure that when the stack + // walker computes an instruction pointer for a frame, that instruction pointer is in the correct EH region. + // The document "X64 and ARM ABIs.docx" has more details. The situations: + // 1. If the call instruction is in a different EH region as the instruction that follows it. + // 2. If the call immediately precedes an OS epilog. (Note that what the JIT or VM consider an epilog might + // be slightly different from what the OS considers an epilog, and it is the OS-reported epilog that matters + // here.) + // We handle case #1 here, and case #2 in the emitter. + if (getEmitter()->emitIsLastInsCall()) + { + // Ok, the last instruction generated is a call instruction. Do any of the other conditions hold? + // Note: we may be generating a few too many NOPs for the case of call preceding an epilog. Technically, + // if the next block is a BBJ_RETURN, an epilog will be generated, but there may be some instructions + // generated before the OS epilog starts, such as a GS cookie check. + if ((block->bbNext == nullptr) || !BasicBlock::sameEHRegion(block, block->bbNext)) + { + // We only need the NOP if we're not going to generate any more code as part of the block end. + + switch (block->bbJumpKind) + { + case BBJ_ALWAYS: + case BBJ_THROW: + case BBJ_CALLFINALLY: + case BBJ_EHCATCHRET: + // We're going to generate more code below anyway, so no need for the NOP. + + case BBJ_RETURN: + case BBJ_EHFINALLYRET: + case BBJ_EHFILTERRET: + // These are the "epilog follows" case, handled in the emitter. + + break; + + case BBJ_NONE: + if (block->bbNext == nullptr) + { + // Call immediately before the end of the code; we should never get here . + instGen(INS_BREAKPOINT); // This should never get executed + } + else + { + // We need the NOP + instGen(INS_nop); + } + break; + + case BBJ_COND: + case BBJ_SWITCH: + // These can't have a call as the last instruction! + + default: + noway_assert(!"Unexpected bbJumpKind"); + break; + } + } + } +#endif // _TARGET_AMD64_ + + /* Do we need to generate a jump or return? 
*/ + + switch (block->bbJumpKind) + { + case BBJ_ALWAYS: + inst_JMP(EJ_jmp, block->bbJumpDest); + break; + + case BBJ_RETURN: + genExitCode(block); + break; + + case BBJ_THROW: + // If we have a throw at the end of a function or funclet, we need to emit another instruction + // afterwards to help the OS unwinder determine the correct context during unwind. + // We insert an unexecuted breakpoint instruction in several situations + // following a throw instruction: + // 1. If the throw is the last instruction of the function or funclet. This helps + // the OS unwinder determine the correct context during an unwind from the + // thrown exception. + // 2. If this is this is the last block of the hot section. + // 3. If the subsequent block is a special throw block. + // 4. On AMD64, if the next block is in a different EH region. + if ((block->bbNext == nullptr) || (block->bbNext->bbFlags & BBF_FUNCLET_BEG) || + !BasicBlock::sameEHRegion(block, block->bbNext) || + (!isFramePointerUsed() && compiler->fgIsThrowHlpBlk(block->bbNext)) || + block->bbNext == compiler->fgFirstColdBlock) + { + instGen(INS_BREAKPOINT); // This should never get executed + } + + break; + + case BBJ_CALLFINALLY: + +#if FEATURE_EH_FUNCLETS + + // Generate a call to the finally, like this: + // mov rcx,qword ptr [rbp + 20H] // Load rcx with PSPSym + // call finally-funclet + // jmp finally-return // Only for non-retless finally calls + // The jmp can be a NOP if we're going to the next block. + // If we're generating code for the main function (not a funclet), and there is no localloc, + // then RSP at this point is the same value as that stored in the PSPsym. So just copy RSP + // instead of loading the PSPSym in this case. + + if (!compiler->compLocallocUsed && (compiler->funCurrentFunc()->funKind == FUNC_ROOT)) + { + inst_RV_RV(INS_mov, REG_ARG_0, REG_SPBASE, TYP_I_IMPL); + } + else + { + getEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_ARG_0, compiler->lvaPSPSym, 0); + } + getEmitter()->emitIns_J(INS_call, block->bbJumpDest); + + if (block->bbFlags & BBF_RETLESS_CALL) + { + // We have a retless call, and the last instruction generated was a call. + // If the next block is in a different EH region (or is the end of the code + // block), then we need to generate a breakpoint here (since it will never + // get executed) to get proper unwind behavior. + + if ((block->bbNext == nullptr) || !BasicBlock::sameEHRegion(block, block->bbNext)) + { + instGen(INS_BREAKPOINT); // This should never get executed + } + } + else + { + // Because of the way the flowgraph is connected, the liveness info for this one instruction + // after the call is not (can not be) correct in cases where a variable has a last use in the + // handler. So turn off GC reporting for this single instruction. + getEmitter()->emitDisableGC(); + + // Now go to where the finally funclet needs to return to. + if (block->bbNext->bbJumpDest == block->bbNext->bbNext) + { + // Fall-through. + // TODO-XArch-CQ: Can we get rid of this instruction, and just have the call return directly + // to the next instruction? This would depend on stack walking from within the finally + // handler working without this instruction being in this special EH region. + instGen(INS_nop); + } + else + { + inst_JMP(EJ_jmp, block->bbNext->bbJumpDest); + } + + getEmitter()->emitEnableGC(); + } + +#else // !FEATURE_EH_FUNCLETS + + // If we are about to invoke a finally locally from a try block, we have to set the ShadowSP slot + // corresponding to the finally's nesting level. 
When invoked in response to an exception, the + // EE does this. + // + // We have a BBJ_CALLFINALLY followed by a BBJ_ALWAYS. + // + // We will emit : + // mov [ebp - (n + 1)], 0 + // mov [ebp - n ], 0xFC + // push &step + // jmp finallyBlock + // ... + // step: + // mov [ebp - n ], 0 + // jmp leaveTarget + // ... + // leaveTarget: + + noway_assert(isFramePointerUsed()); + + // Get the nesting level which contains the finally + compiler->fgGetNestingLevel(block, &finallyNesting); + + // The last slot is reserved for ICodeManager::FixContext(ppEndRegion) + unsigned filterEndOffsetSlotOffs; + filterEndOffsetSlotOffs = + (unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE); + + unsigned curNestingSlotOffs; + curNestingSlotOffs = (unsigned)(filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE)); + + // Zero out the slot for the next nesting level + instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, 0, compiler->lvaShadowSPslotsVar, + curNestingSlotOffs - TARGET_POINTER_SIZE); + instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, LCL_FINALLY_MARK, compiler->lvaShadowSPslotsVar, + curNestingSlotOffs); + + // Now push the address where the finally funclet should return to directly. + if (!(block->bbFlags & BBF_RETLESS_CALL)) + { + assert(block->isBBCallAlwaysPair()); + getEmitter()->emitIns_J(INS_push_hide, block->bbNext->bbJumpDest); + } + else + { + // EE expects a DWORD, so we give him 0 + inst_IV(INS_push_hide, 0); + } + + // Jump to the finally BB + inst_JMP(EJ_jmp, block->bbJumpDest); + +#endif // !FEATURE_EH_FUNCLETS + + // The BBJ_ALWAYS is used because the BBJ_CALLFINALLY can't point to the + // jump target using bbJumpDest - that is already used to point + // to the finally block. So just skip past the BBJ_ALWAYS unless the + // block is RETLESS. + if (!(block->bbFlags & BBF_RETLESS_CALL)) + { + assert(block->isBBCallAlwaysPair()); + + lblk = block; + block = block->bbNext; + } + + break; + +#if FEATURE_EH_FUNCLETS + + case BBJ_EHCATCHRET: + // Set RAX to the address the VM should return to after the catch. + // Generate a RIP-relative + // lea reg, [rip + disp32] ; the RIP is implicit + // which will be position-indepenent. + getEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, block->bbJumpDest, REG_INTRET); + __fallthrough; + + case BBJ_EHFINALLYRET: + case BBJ_EHFILTERRET: + genReserveFuncletEpilog(block); + break; + +#else // !FEATURE_EH_FUNCLETS + + case BBJ_EHCATCHRET: + noway_assert(!"Unexpected BBJ_EHCATCHRET"); // not used on x86 + + case BBJ_EHFINALLYRET: + case BBJ_EHFILTERRET: + { + // The last statement of the block must be a GT_RETFILT, which has already been generated. + assert(block->lastNode() != nullptr); + assert(block->lastNode()->OperGet() == GT_RETFILT); + + if (block->bbJumpKind == BBJ_EHFINALLYRET) + { + assert(block->lastNode()->gtOp.gtOp1 == nullptr); // op1 == nullptr means endfinally + + // Return using a pop-jmp sequence. As the "try" block calls + // the finally with a jmp, this leaves the x86 call-ret stack + // balanced in the normal flow of path. + + noway_assert(isFramePointerRequired()); + inst_RV(INS_pop_hide, REG_EAX, TYP_I_IMPL); + inst_RV(INS_i_jmp, REG_EAX, TYP_I_IMPL); + } + else + { + assert(block->bbJumpKind == BBJ_EHFILTERRET); + + // The return value has already been computed. 
+ instGen_Return(0); + } + } + break; + +#endif // !FEATURE_EH_FUNCLETS + + case BBJ_NONE: + case BBJ_COND: + case BBJ_SWITCH: + break; + + default: + noway_assert(!"Unexpected bbJumpKind"); + break; + } + +#ifdef DEBUG + compiler->compCurBB = nullptr; +#endif + + } //------------------ END-FOR each block of the method ------------------- + + /* Nothing is live at this point */ + genUpdateLife(VarSetOps::MakeEmpty(compiler)); + + /* Finalize the spill tracking logic */ + + regSet.rsSpillEnd(); + + /* Finalize the temp tracking logic */ + + compiler->tmpEnd(); + +#ifdef DEBUG + if (compiler->verbose) + { + printf("\n# "); + printf("compCycleEstimate = %6d, compSizeEstimate = %5d ", compiler->compCycleEstimate, + compiler->compSizeEstimate); + printf("%s\n", compiler->info.compFullName); + } +#endif +} + +// return the child that has the same reg as the dst (if any) +// other child returned (out param) in 'other' +GenTree* sameRegAsDst(GenTree* tree, GenTree*& other /*out*/) +{ + if (tree->gtRegNum == REG_NA) + { + other = nullptr; + return nullptr; + } + + GenTreePtr op1 = tree->gtOp.gtOp1; + GenTreePtr op2 = tree->gtOp.gtOp2; + if (op1->gtRegNum == tree->gtRegNum) + { + other = op2; + return op1; + } + if (op2->gtRegNum == tree->gtRegNum) + { + other = op1; + return op2; + } + else + { + other = nullptr; + return nullptr; + } +} + +// Move an immediate value into an integer register + +void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, regNumber reg, ssize_t imm, insFlags flags) +{ + // reg cannot be a FP register + assert(!genIsValidFloatReg(reg)); + + if (!compiler->opts.compReloc) + { + size = EA_SIZE(size); // Strip any Reloc flags from size if we aren't doing relocs + } + + if ((imm == 0) && !EA_IS_RELOC(size)) + { + instGen_Set_Reg_To_Zero(size, reg, flags); + } + else + { + if (genDataIndirAddrCanBeEncodedAsPCRelOffset(imm)) + { + getEmitter()->emitIns_R_AI(INS_lea, EA_PTR_DSP_RELOC, reg, imm); + } + else + { + getEmitter()->emitIns_R_I(INS_mov, size, reg, imm); + } + } + regTracker.rsTrackRegIntCns(reg, imm); +} + +/*********************************************************************************** + * + * Generate code to set a register 'targetReg' of type 'targetType' to the constant + * specified by the constant (GT_CNS_INT or GT_CNS_DBL) in 'tree'. This does not call + * genProduceReg() on the target register. 
+ */ +void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTreePtr tree) +{ + + switch (tree->gtOper) + { + case GT_CNS_INT: + { + // relocatable values tend to come down as a CNS_INT of native int type + // so the line between these two opcodes is kind of blurry + GenTreeIntConCommon* con = tree->AsIntConCommon(); + ssize_t cnsVal = con->IconValue(); + + if (con->ImmedValNeedsReloc(compiler)) + { + instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, targetReg, cnsVal); + regTracker.rsTrackRegTrash(targetReg); + } + else + { + genSetRegToIcon(targetReg, cnsVal, targetType); + } + } + break; + + case GT_CNS_DBL: + { + double constValue = tree->gtDblCon.gtDconVal; + + // Make sure we use "xorpd reg, reg" only for +ve zero constant (0.0) and not for -ve zero (-0.0) + if (*(__int64*)&constValue == 0) + { + // A faster/smaller way to generate 0 + instruction ins = genGetInsForOper(GT_XOR, targetType); + inst_RV_RV(ins, targetReg, targetReg, targetType); + } + else + { + GenTreePtr cns; + if (targetType == TYP_FLOAT) + { + float f = forceCastToFloat(constValue); + cns = genMakeConst(&f, targetType, tree, false); + } + else + { + cns = genMakeConst(&constValue, targetType, tree, true); + } + + inst_RV_TT(ins_Load(targetType), targetReg, cns); + } + } + break; + + default: + unreached(); + } +} + +// Generate code to get the high N bits of a N*N=2N bit multiplication result +void CodeGen::genCodeForMulHi(GenTreeOp* treeNode) +{ + assert(!(treeNode->gtFlags & GTF_UNSIGNED)); + assert(!treeNode->gtOverflowEx()); + + regNumber targetReg = treeNode->gtRegNum; + var_types targetType = treeNode->TypeGet(); + emitter* emit = getEmitter(); + emitAttr size = emitTypeSize(treeNode); + GenTree* op1 = treeNode->gtOp.gtOp1; + GenTree* op2 = treeNode->gtOp.gtOp2; + + // to get the high bits of the multiply, we are constrained to using the + // 1-op form: RDX:RAX = RAX * rm + // The 3-op form (Rx=Ry*Rz) does not support it. + + genConsumeOperands(treeNode->AsOp()); + + GenTree* regOp = op1; + GenTree* rmOp = op2; + + // Set rmOp to the contained memory operand (if any) + // + if (op1->isContained() || (!op2->isContained() && (op2->gtRegNum == targetReg))) + { + regOp = op2; + rmOp = op1; + } + assert(!regOp->isContained()); + + // Setup targetReg when neither of the source operands was a matching register + if (regOp->gtRegNum != targetReg) + { + inst_RV_RV(ins_Copy(targetType), targetReg, regOp->gtRegNum, targetType); + } + + emit->emitInsBinary(INS_imulEAX, size, treeNode, rmOp); + + // Move the result to the desired register, if necessary + if (targetReg != REG_RDX) + { + inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType); + } +} + +// generate code for a DIV or MOD operation +// +void CodeGen::genCodeForDivMod(GenTreeOp* treeNode) +{ + GenTree* dividend = treeNode->gtOp1; + GenTree* divisor = treeNode->gtOp2; + genTreeOps oper = treeNode->OperGet(); + emitAttr size = emitTypeSize(treeNode); + regNumber targetReg = treeNode->gtRegNum; + var_types targetType = treeNode->TypeGet(); + emitter* emit = getEmitter(); + + // dividend is not contained. + assert(!dividend->isContained()); + + genConsumeOperands(treeNode->AsOp()); + if (varTypeIsFloating(targetType)) + { + // divisor is not contained or if contained is a memory op. + // Note that a reg optional operand is a treated as a memory op + // if no register is allocated to it. 
+ assert(!divisor->isContained() || divisor->isMemoryOp() || divisor->IsCnsFltOrDbl() || + divisor->IsRegOptional()); + + // Floating point div/rem operation + assert(oper == GT_DIV || oper == GT_MOD); + + if (dividend->gtRegNum == targetReg) + { + emit->emitInsBinary(genGetInsForOper(treeNode->gtOper, targetType), size, treeNode, divisor); + } + else if (!divisor->isContained() && divisor->gtRegNum == targetReg) + { + // It is not possible to generate 2-operand divss or divsd where reg2 = reg1 / reg2 + // because divss/divsd reg1, reg2 will over-write reg1. Therefore, in case of AMD64 + // LSRA has to make sure that such a register assignment is not generated for floating + // point div/rem operations. + noway_assert( + !"GT_DIV/GT_MOD (float): case of reg2 = reg1 / reg2, LSRA should never generate such a reg assignment"); + } + else + { + inst_RV_RV(ins_Copy(targetType), targetReg, dividend->gtRegNum, targetType); + emit->emitInsBinary(genGetInsForOper(treeNode->gtOper, targetType), size, treeNode, divisor); + } + } + else + { + // dividend must be in RAX + if (dividend->gtRegNum != REG_RAX) + { + inst_RV_RV(INS_mov, REG_RAX, dividend->gtRegNum, targetType); + } + + // zero or sign extend rax to rdx + if (oper == GT_UMOD || oper == GT_UDIV) + { + instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX); + } + else + { + emit->emitIns(INS_cdq, size); + // the cdq instruction writes RDX, So clear the gcInfo for RDX + gcInfo.gcMarkRegSetNpt(RBM_RDX); + } + + // Perform the 'targetType' (64-bit or 32-bit) divide instruction + instruction ins; + if (oper == GT_UMOD || oper == GT_UDIV) + { + ins = INS_div; + } + else + { + ins = INS_idiv; + } + + emit->emitInsBinary(ins, size, treeNode, divisor); + + // DIV/IDIV instructions always store the quotient in RAX and the remainder in RDX. + // Move the result to the desired register, if necessary + if (oper == GT_DIV || oper == GT_UDIV) + { + if (targetReg != REG_RAX) + { + inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType); + } + } + else + { + assert((oper == GT_MOD) || (oper == GT_UMOD)); + if (targetReg != REG_RDX) + { + inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType); + } + } + } + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genCodeForBinary: Generate code for many binary arithmetic operators +// This method is expected to have called genConsumeOperands() before calling it. +// +// Arguments: +// treeNode - The binary operation for which we are generating code. +// +// Return Value: +// None. +// +// Notes: +// Mul and div variants have special constraints on x64 so are not handled here. +// See teh assert below for the operators that are handled. 
+ +void CodeGen::genCodeForBinary(GenTree* treeNode) +{ + const genTreeOps oper = treeNode->OperGet(); + regNumber targetReg = treeNode->gtRegNum; + var_types targetType = treeNode->TypeGet(); + emitter* emit = getEmitter(); + +#if defined(_TARGET_64BIT_) + assert(oper == GT_OR || oper == GT_XOR || oper == GT_AND || oper == GT_ADD || oper == GT_SUB); +#else // !defined(_TARGET_64BIT_) + assert(oper == GT_OR || oper == GT_XOR || oper == GT_AND || oper == GT_ADD_LO || oper == GT_ADD_HI || + oper == GT_SUB_LO || oper == GT_SUB_HI || oper == GT_MUL_HI || oper == GT_DIV_HI || oper == GT_MOD_HI || + oper == GT_ADD || oper == GT_SUB); +#endif // !defined(_TARGET_64BIT_) + + GenTreePtr op1 = treeNode->gtGetOp1(); + GenTreePtr op2 = treeNode->gtGetOp2(); + + // Commutative operations can mark op1 as contained to generate "op reg, memop/immed" + if (op1->isContained()) + { + assert(treeNode->OperIsCommutative()); + assert(op1->isMemoryOp() || op1->IsCnsNonZeroFltOrDbl() || op1->IsIntCnsFitsInI32() || op1->IsRegOptional()); + + op1 = treeNode->gtGetOp2(); + op2 = treeNode->gtGetOp1(); + } + + instruction ins = genGetInsForOper(treeNode->OperGet(), targetType); + + // The arithmetic node must be sitting in a register (since it's not contained) + noway_assert(targetReg != REG_NA); + + regNumber op1reg = op1->isContained() ? REG_NA : op1->gtRegNum; + regNumber op2reg = op2->isContained() ? REG_NA : op2->gtRegNum; + + GenTreePtr dst; + GenTreePtr src; + + // This is the case of reg1 = reg1 op reg2 + // We're ready to emit the instruction without any moves + if (op1reg == targetReg) + { + dst = op1; + src = op2; + } + // We have reg1 = reg2 op reg1 + // In order for this operation to be correct + // we need that op is a commutative operation so + // we can convert it into reg1 = reg1 op reg2 and emit + // the same code as above + else if (op2reg == targetReg) + { + noway_assert(GenTree::OperIsCommutative(oper)); + dst = op2; + src = op1; + } + // now we know there are 3 different operands so attempt to use LEA + else if (oper == GT_ADD && !varTypeIsFloating(treeNode) && !treeNode->gtOverflowEx() // LEA does not set flags + && (op2->isContainedIntOrIImmed() || !op2->isContained())) + { + if (op2->isContainedIntOrIImmed()) + { + emit->emitIns_R_AR(INS_lea, emitTypeSize(treeNode), targetReg, op1reg, + (int)op2->AsIntConCommon()->IconValue()); + } + else + { + assert(op2reg != REG_NA); + emit->emitIns_R_ARX(INS_lea, emitTypeSize(treeNode), targetReg, op1reg, op2reg, 1, 0); + } + genProduceReg(treeNode); + return; + } + // dest, op1 and op2 registers are different: + // reg3 = reg1 op reg2 + // We can implement this by issuing a mov: + // reg3 = reg1 + // reg3 = reg3 op reg2 + else + { + inst_RV_RV(ins_Copy(targetType), targetReg, op1reg, targetType); + regTracker.rsTrackRegCopy(targetReg, op1reg); + gcInfo.gcMarkRegPtrVal(targetReg, targetType); + dst = treeNode; + src = op2; + } + + // try to use an inc or dec + if (oper == GT_ADD && !varTypeIsFloating(treeNode) && src->isContainedIntOrIImmed() && !treeNode->gtOverflowEx()) + { + if (src->IsIntegralConst(1)) + { + emit->emitIns_R(INS_inc, emitTypeSize(treeNode), targetReg); + genProduceReg(treeNode); + return; + } + else if (src->IsIntegralConst(-1)) + { + emit->emitIns_R(INS_dec, emitTypeSize(treeNode), targetReg); + genProduceReg(treeNode); + return; + } + } + regNumber r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src); + noway_assert(r == targetReg); + + if (treeNode->gtOverflowEx()) + { +#if !defined(_TARGET_64BIT_) + assert(oper == GT_ADD 
|| oper == GT_SUB || oper == GT_ADD_HI || oper == GT_SUB_HI); +#else + assert(oper == GT_ADD || oper == GT_SUB); +#endif + genCheckOverflow(treeNode); + } + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// isStructReturn: Returns whether the 'treeNode' is returning a struct. +// +// Arguments: +// treeNode - The tree node to evaluate whether is a struct return. +// +// Return Value: +// For AMD64 *nix: returns true if the 'treeNode" is a GT_RETURN node, of type struct. +// Otherwise returns false. +// For other platforms always returns false. +// +bool CodeGen::isStructReturn(GenTreePtr treeNode) +{ + // This method could be called for 'treeNode' of GT_RET_FILT or GT_RETURN. + // For the GT_RET_FILT, the return is always + // a bool or a void, for the end of a finally block. + noway_assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT); + if (treeNode->OperGet() != GT_RETURN) + { + return false; + } + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + return varTypeIsStruct(treeNode); +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING + assert(!varTypeIsStruct(treeNode)); + return false; +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING +} + +//------------------------------------------------------------------------ +// genStructReturn: Generates code for returning a struct. +// +// Arguments: +// treeNode - The GT_RETURN tree node. +// +// Return Value: +// None +// +// Assumption: +// op1 of GT_RETURN node is either GT_LCL_VAR or multi-reg GT_CALL +void CodeGen::genStructReturn(GenTreePtr treeNode) +{ + assert(treeNode->OperGet() == GT_RETURN); + GenTreePtr op1 = treeNode->gtGetOp1(); + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (op1->OperGet() == GT_LCL_VAR) + { + GenTreeLclVarCommon* lclVar = op1->AsLclVarCommon(); + LclVarDsc* varDsc = &(compiler->lvaTable[lclVar->gtLclNum]); + assert(varDsc->lvIsMultiRegRet); + + ReturnTypeDesc retTypeDesc; + retTypeDesc.InitializeStructReturnType(compiler, varDsc->lvVerTypeInfo.GetClassHandle()); + unsigned regCount = retTypeDesc.GetReturnRegCount(); + assert(regCount == MAX_RET_REG_COUNT); + + if (varTypeIsEnregisterableStruct(op1)) + { + // Right now the only enregistrable structs supported are SIMD vector types. + assert(varTypeIsSIMD(op1)); + assert(!op1->isContained()); + + // This is a case of operand is in a single reg and needs to be + // returned in multiple ABI return registers. + regNumber opReg = genConsumeReg(op1); + regNumber reg0 = retTypeDesc.GetABIReturnReg(0); + regNumber reg1 = retTypeDesc.GetABIReturnReg(1); + + if (opReg != reg0 && opReg != reg1) + { + // Operand reg is different from return regs. + // Copy opReg to reg0 and let it to be handled by one of the + // two cases below. + inst_RV_RV(ins_Copy(TYP_DOUBLE), reg0, opReg, TYP_DOUBLE); + opReg = reg0; + } + + if (opReg == reg0) + { + assert(opReg != reg1); + + // reg0 - already has required 8-byte in bit position [63:0]. + // reg1 = opReg. + // swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0]. + inst_RV_RV(ins_Copy(TYP_DOUBLE), reg1, opReg, TYP_DOUBLE); + } + else + { + assert(opReg == reg1); + + // reg0 = opReg. + // swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0]. 
+ inst_RV_RV(ins_Copy(TYP_DOUBLE), reg0, opReg, TYP_DOUBLE); + } + inst_RV_RV_IV(INS_shufpd, EA_16BYTE, reg1, reg1, 0x01); + } + else + { + assert(op1->isContained()); + + // Copy var on stack into ABI return registers + int offset = 0; + for (unsigned i = 0; i < regCount; ++i) + { + var_types type = retTypeDesc.GetReturnRegType(i); + regNumber reg = retTypeDesc.GetABIReturnReg(i); + getEmitter()->emitIns_R_S(ins_Load(type), emitTypeSize(type), reg, lclVar->gtLclNum, offset); + offset += genTypeSize(type); + } + } + } + else + { + assert(op1->IsMultiRegCall() || op1->IsCopyOrReloadOfMultiRegCall()); + + genConsumeRegs(op1); + + GenTree* actualOp1 = op1->gtSkipReloadOrCopy(); + GenTreeCall* call = actualOp1->AsCall(); + ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); + unsigned regCount = retTypeDesc->GetReturnRegCount(); + assert(regCount == MAX_RET_REG_COUNT); + + // Handle circular dependency between call allocated regs and ABI return regs. + // + // It is possible under LSRA stress that originally allocated regs of call node, + // say rax and rdx, are spilled and reloaded to rdx and rax respectively. But + // GT_RETURN needs to move values as follows: rdx->rax, rax->rdx. Similar kind + // kind of circular dependency could arise between xmm0 and xmm1 return regs. + // Codegen is expected to handle such circular dependency. + // + var_types regType0 = retTypeDesc->GetReturnRegType(0); + regNumber returnReg0 = retTypeDesc->GetABIReturnReg(0); + regNumber allocatedReg0 = call->GetRegNumByIdx(0); + + var_types regType1 = retTypeDesc->GetReturnRegType(1); + regNumber returnReg1 = retTypeDesc->GetABIReturnReg(1); + regNumber allocatedReg1 = call->GetRegNumByIdx(1); + + if (op1->IsCopyOrReload()) + { + // GT_COPY/GT_RELOAD will have valid reg for those positions + // that need to be copied or reloaded. + regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0); + if (reloadReg != REG_NA) + { + allocatedReg0 = reloadReg; + } + + reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1); + if (reloadReg != REG_NA) + { + allocatedReg1 = reloadReg; + } + } + + if (allocatedReg0 == returnReg1 && allocatedReg1 == returnReg0) + { + // Circular dependency - swap allocatedReg0 and allocatedReg1 + if (varTypeIsFloating(regType0)) + { + assert(varTypeIsFloating(regType1)); + + // The fastest way to swap two XMM regs is using PXOR + inst_RV_RV(INS_pxor, allocatedReg0, allocatedReg1, TYP_DOUBLE); + inst_RV_RV(INS_pxor, allocatedReg1, allocatedReg0, TYP_DOUBLE); + inst_RV_RV(INS_pxor, allocatedReg0, allocatedReg1, TYP_DOUBLE); + } + else + { + assert(varTypeIsIntegral(regType0)); + assert(varTypeIsIntegral(regType1)); + inst_RV_RV(INS_xchg, allocatedReg1, allocatedReg0, TYP_I_IMPL); + } + } + else if (allocatedReg1 == returnReg0) + { + // Change the order of moves to correctly handle dependency. + if (allocatedReg1 != returnReg1) + { + inst_RV_RV(ins_Copy(regType1), returnReg1, allocatedReg1, regType1); + } + + if (allocatedReg0 != returnReg0) + { + inst_RV_RV(ins_Copy(regType0), returnReg0, allocatedReg0, regType0); + } + } + else + { + // No circular dependency case. + if (allocatedReg0 != returnReg0) + { + inst_RV_RV(ins_Copy(regType0), returnReg0, allocatedReg0, regType0); + } + + if (allocatedReg1 != returnReg1) + { + inst_RV_RV(ins_Copy(regType1), returnReg1, allocatedReg1, regType1); + } + } + } +#else + unreached(); +#endif +} + +//------------------------------------------------------------------------ +// genReturn: Generates code for return statement. 
+// In case of struct return, delegates to the genStructReturn method. +// +// Arguments: +// treeNode - The GT_RETURN or GT_RETFILT tree node. +// +// Return Value: +// None +// +void CodeGen::genReturn(GenTreePtr treeNode) +{ + assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT); + GenTreePtr op1 = treeNode->gtGetOp1(); + var_types targetType = treeNode->TypeGet(); + +#ifdef DEBUG + if (targetType == TYP_VOID) + { + assert(op1 == nullptr); + } +#endif + +#ifdef _TARGET_X86_ + if (treeNode->TypeGet() == TYP_LONG) + { + assert(op1 != nullptr); + noway_assert(op1->OperGet() == GT_LONG); + GenTree* loRetVal = op1->gtGetOp1(); + GenTree* hiRetVal = op1->gtGetOp2(); + noway_assert((loRetVal->gtRegNum != REG_NA) && (hiRetVal->gtRegNum != REG_NA)); + + genConsumeReg(loRetVal); + genConsumeReg(hiRetVal); + if (loRetVal->gtRegNum != REG_LNGRET_LO) + { + inst_RV_RV(ins_Copy(targetType), REG_LNGRET_LO, loRetVal->gtRegNum, TYP_INT); + } + if (hiRetVal->gtRegNum != REG_LNGRET_HI) + { + inst_RV_RV(ins_Copy(targetType), REG_LNGRET_HI, hiRetVal->gtRegNum, TYP_INT); + } + } + else +#endif // !defined(_TARGET_X86_) + { + if (isStructReturn(treeNode)) + { + genStructReturn(treeNode); + } + else if (targetType != TYP_VOID) + { + assert(op1 != nullptr); + noway_assert(op1->gtRegNum != REG_NA); + + // !! NOTE !! genConsumeReg will clear op1 as GC ref after it has + // consumed a reg for the operand. This is because the variable + // is dead after return. But we are issuing more instructions + // like "profiler leave callback" after this consumption. So + // if you are issuing more instructions after this point, + // remember to keep the variable live up until the new method + // exit point where it is actually dead. + genConsumeReg(op1); + + regNumber retReg = varTypeIsFloating(treeNode) ? REG_FLOATRET : REG_INTRET; +#ifdef _TARGET_X86_ + if (varTypeIsFloating(treeNode)) + { + // Spill the return value register from an XMM register to the stack, then load it on the x87 stack. + // If it already has a home location, use that. Otherwise, we need a temp. + if (genIsRegCandidateLocal(op1) && compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvOnFrame) + { + // Store local variable to its home location, if necessary. + if ((op1->gtFlags & GTF_REG_VAL) != 0) + { + op1->gtFlags &= ~GTF_REG_VAL; + inst_TT_RV(ins_Store(op1->gtType, + compiler->isSIMDTypeLocalAligned(op1->gtLclVarCommon.gtLclNum)), + op1, op1->gtRegNum); + } + // Now, load it to the fp stack. + getEmitter()->emitIns_S(INS_fld, emitTypeSize(op1), op1->AsLclVarCommon()->gtLclNum, 0); + } + else + { + // Spill the value, which should be in a register, then load it to the fp stack. + // TODO-X86-CQ: Deal with things that are already in memory (don't call genConsumeReg yet). + op1->gtFlags |= GTF_SPILL; + regSet.rsSpillTree(op1->gtRegNum, op1); + op1->gtFlags |= GTF_SPILLED; + op1->gtFlags &= ~GTF_SPILL; + + TempDsc* t = regSet.rsUnspillInPlace(op1, op1->gtRegNum); + inst_FS_ST(INS_fld, emitActualTypeSize(op1->gtType), t, 0); + op1->gtFlags &= ~GTF_SPILLED; + compiler->tmpRlsTemp(t); + } + } + else +#endif // _TARGET_X86_ + { + if (op1->gtRegNum != retReg) + { + inst_RV_RV(ins_Copy(targetType), retReg, op1->gtRegNum, targetType); + } + } + } + } + +#ifdef PROFILING_SUPPORTED + // !! Note !! + // TODO-AMD64-Unix: If the profiler hook is implemented on *nix, make sure for 2 register returned structs + // the RAX and RDX needs to be kept alive. Make the necessary changes in lowerxarch.cpp + // in the handling of the GT_RETURN statement. 
+ // Such structs containing GC pointers need to be handled by calling gcInfo.gcMarkRegSetNpt + // for the return registers containing GC refs. + + // There will be a single return block while generating profiler ELT callbacks. + // + // Reason for not materializing Leave callback as a GT_PROF_HOOK node after GT_RETURN: + // In flowgraph and other places assert that the last node of a block marked as + // GT_RETURN is either a GT_RETURN or GT_JMP or a tail call. It would be nice to + // maintain such an invariant irrespective of whether profiler hook needed or not. + // Also, there is not much to be gained by materializing it as an explicit node. + if (compiler->compCurBB == compiler->genReturnBB) + { + // !! NOTE !! + // Since we are invalidating the assumption that we would slip into the epilog + // right after the "return", we need to preserve the return reg's GC state + // across the call until actual method return. + if (varTypeIsGC(compiler->info.compRetType)) + { + gcInfo.gcMarkRegPtrVal(REG_INTRET, compiler->info.compRetType); + } + + genProfilingLeaveCallback(); + + if (varTypeIsGC(compiler->info.compRetType)) + { + gcInfo.gcMarkRegSetNpt(REG_INTRET); + } + } +#endif +} + +/***************************************************************************** + * + * Generate code for a single node in the tree. + * Preconditions: All operands have been evaluated + * + */ +void CodeGen::genCodeForTreeNode(GenTreePtr treeNode) +{ + regNumber targetReg; +#if !defined(_TARGET_64BIT_) + if (treeNode->TypeGet() == TYP_LONG) + { + // All long enregistered nodes will have been decomposed into their + // constituent lo and hi nodes. + targetReg = REG_NA; + } + else +#endif // !defined(_TARGET_64BIT_) + { + targetReg = treeNode->gtRegNum; + } + var_types targetType = treeNode->TypeGet(); + emitter* emit = getEmitter(); + +#ifdef DEBUG + // Validate that all the operands for the current node are consumed in order. + // This is important because LSRA ensures that any necessary copies will be + // handled correctly. + lastConsumedNode = nullptr; + if (compiler->verbose) + { + unsigned seqNum = treeNode->gtSeqNum; // Useful for setting a conditional break in Visual Studio + printf("Generating: "); + compiler->gtDispTree(treeNode, nullptr, nullptr, true); + } +#endif // DEBUG + + // Is this a node whose value is already in a register? LSRA denotes this by + // setting the GTF_REUSE_REG_VAL flag. + if (treeNode->IsReuseRegVal()) + { + // For now, this is only used for constant nodes. + assert((treeNode->OperIsConst())); + JITDUMP(" TreeNode is marked ReuseReg\n"); + return; + } + + // contained nodes are part of their parents for codegen purposes + // ex : immediates, most LEAs + if (treeNode->isContained()) + { + return; + } + + switch (treeNode->gtOper) + { + case GT_START_NONGC: + getEmitter()->emitDisableGC(); + break; + + case GT_PROF_HOOK: +#ifdef PROFILING_SUPPORTED + // We should be seeing this only if profiler hook is needed + noway_assert(compiler->compIsProfilerHookNeeded()); + + // Right now this node is used only for tail calls. In future if + // we intend to use it for Enter or Leave hooks, add a data member + // to this node indicating the kind of profiler hook. For example, + // helper number can be used. 
+ genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL); +#endif // PROFILING_SUPPORTED + break; + + case GT_LCLHEAP: + genLclHeap(treeNode); + break; + + case GT_CNS_INT: +#ifdef _TARGET_X86_ + NYI_IF(treeNode->IsIconHandle(GTF_ICON_TLS_HDL), "TLS constants"); +#endif // _TARGET_X86_ + __fallthrough; + + case GT_CNS_DBL: + genSetRegToConst(targetReg, targetType, treeNode); + genProduceReg(treeNode); + break; + + case GT_NEG: + case GT_NOT: + if (varTypeIsFloating(targetType)) + { + assert(treeNode->gtOper == GT_NEG); + genSSE2BitwiseOp(treeNode); + } + else + { + GenTreePtr operand = treeNode->gtGetOp1(); + assert(!operand->isContained()); + regNumber operandReg = genConsumeReg(operand); + + if (operandReg != targetReg) + { + inst_RV_RV(INS_mov, targetReg, operandReg, targetType); + } + + instruction ins = genGetInsForOper(treeNode->OperGet(), targetType); + inst_RV(ins, targetReg, targetType); + } + genProduceReg(treeNode); + break; + + case GT_OR: + case GT_XOR: + case GT_AND: + assert(varTypeIsIntegralOrI(treeNode)); + __fallthrough; + +#if !defined(_TARGET_64BIT_) + case GT_ADD_LO: + case GT_ADD_HI: + case GT_SUB_LO: + case GT_SUB_HI: +#endif // !defined(_TARGET_64BIT_) + case GT_ADD: + case GT_SUB: + genConsumeOperands(treeNode->AsOp()); + genCodeForBinary(treeNode); + break; + + case GT_LSH: + case GT_RSH: + case GT_RSZ: + case GT_ROL: + case GT_ROR: + genCodeForShift(treeNode); + // genCodeForShift() calls genProduceReg() + break; + + case GT_CAST: +#if !defined(_TARGET_64BIT_) + // We will NYI in DecomposeNode() if we are cast TO a long type, but we do not + // yet support casting FROM a long type either, and that's simpler to catch + // here. + NYI_IF(varTypeIsLong(treeNode->gtOp.gtOp1), "Casts from TYP_LONG"); +#endif // !defined(_TARGET_64BIT_) + + if (varTypeIsFloating(targetType) && varTypeIsFloating(treeNode->gtOp.gtOp1)) + { + // Casts float/double <--> double/float + genFloatToFloatCast(treeNode); + } + else if (varTypeIsFloating(treeNode->gtOp.gtOp1)) + { + // Casts float/double --> int32/int64 + genFloatToIntCast(treeNode); + } + else if (varTypeIsFloating(targetType)) + { + // Casts int32/uint32/int64/uint64 --> float/double + genIntToFloatCast(treeNode); + } + else + { + // Casts int <--> int + genIntToIntCast(treeNode); + } + // The per-case functions call genProduceReg() + break; + + case GT_LCL_VAR: + { + // lcl_vars are not defs + assert((treeNode->gtFlags & GTF_VAR_DEF) == 0); + + GenTreeLclVarCommon* lcl = treeNode->AsLclVarCommon(); + bool isRegCandidate = compiler->lvaTable[lcl->gtLclNum].lvIsRegCandidate(); + + if (isRegCandidate && !(treeNode->gtFlags & GTF_VAR_DEATH)) + { + assert((treeNode->InReg()) || (treeNode->gtFlags & GTF_SPILLED)); + } + + // If this is a register candidate that has been spilled, genConsumeReg() will + // reload it at the point of use. Otherwise, if it's not in a register, we load it here. + + if (!treeNode->InReg() && !(treeNode->gtFlags & GTF_SPILLED)) + { + assert(!isRegCandidate); + + emit->emitIns_R_S(ins_Load(treeNode->TypeGet(), compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)), + emitTypeSize(treeNode), treeNode->gtRegNum, lcl->gtLclNum, 0); + genProduceReg(treeNode); + } + } + break; + + case GT_LCL_FLD_ADDR: + case GT_LCL_VAR_ADDR: + // Address of a local var. This by itself should never be allocated a register. + // If it is worth storing the address in a register then it should be cse'ed into + // a temp and that would be allocated a register. 
+ noway_assert(targetType == TYP_BYREF); + noway_assert(!treeNode->InReg()); + + inst_RV_TT(INS_lea, targetReg, treeNode, 0, EA_BYREF); + genProduceReg(treeNode); + break; + + case GT_LCL_FLD: + { + noway_assert(targetType != TYP_STRUCT); + noway_assert(treeNode->gtRegNum != REG_NA); + +#ifdef FEATURE_SIMD + // Loading of TYP_SIMD12 (i.e. Vector3) field + if (treeNode->TypeGet() == TYP_SIMD12) + { + genLoadLclFldTypeSIMD12(treeNode); + break; + } +#endif + + emitAttr size = emitTypeSize(targetType); + unsigned offs = treeNode->gtLclFld.gtLclOffs; + unsigned varNum = treeNode->gtLclVarCommon.gtLclNum; + assert(varNum < compiler->lvaCount); + + emit->emitIns_R_S(ins_Move_Extend(targetType, treeNode->InReg()), size, targetReg, varNum, offs); + } + genProduceReg(treeNode); + break; + + case GT_STORE_LCL_FLD: + { + noway_assert(targetType != TYP_STRUCT); + noway_assert(!treeNode->InReg()); + assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet())); + +#ifdef FEATURE_SIMD + // storing of TYP_SIMD12 (i.e. Vector3) field + if (treeNode->TypeGet() == TYP_SIMD12) + { + genStoreLclFldTypeSIMD12(treeNode); + break; + } +#endif + GenTreePtr op1 = treeNode->gtGetOp1(); + genConsumeRegs(op1); + emit->emitInsBinary(ins_Store(targetType), emitTypeSize(treeNode), treeNode, op1); + } + break; + + case GT_STORE_LCL_VAR: + { + GenTreePtr op1 = treeNode->gtGetOp1(); + + // var = call, where call returns a multi-reg return value + // case is handled separately. + if (op1->gtSkipReloadOrCopy()->IsMultiRegCall()) + { + genMultiRegCallStoreToLocal(treeNode); + } + else + { + noway_assert(targetType != TYP_STRUCT); + assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet())); + + unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum; + LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]); + + // Ensure that lclVar nodes are typed correctly. + assert(!varDsc->lvNormalizeOnStore() || treeNode->TypeGet() == genActualType(varDsc->TypeGet())); + +#if !defined(_TARGET_64BIT_) + if (treeNode->TypeGet() == TYP_LONG) + { + genStoreLongLclVar(treeNode); + break; + } +#endif // !defined(_TARGET_64BIT_) + +#ifdef FEATURE_SIMD + if (varTypeIsSIMD(targetType) && (targetReg != REG_NA) && op1->IsCnsIntOrI()) + { + // This is only possible for a zero-init. + noway_assert(op1->IsIntegralConst(0)); + genSIMDZero(targetType, varDsc->lvBaseType, targetReg); + genProduceReg(treeNode); + break; + } +#endif // FEATURE_SIMD + + genConsumeRegs(op1); + + if (treeNode->gtRegNum == REG_NA) + { + // stack store + emit->emitInsMov(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)), + emitTypeSize(targetType), treeNode); + varDsc->lvRegNum = REG_STK; + } + else + { + bool containedOp1 = op1->isContained(); + // Look for the case where we have a constant zero which we've marked for reuse, + // but which isn't actually in the register we want. In that case, it's better to create + // zero in the target register, because an xor is smaller than a copy. Note that we could + // potentially handle this in the register allocator, but we can't always catch it there + // because the target may not have a register allocated for it yet. + if (!containedOp1 && (op1->gtRegNum != treeNode->gtRegNum) && + (op1->IsIntegralConst(0) || op1->IsFPZero())) + { + op1->gtRegNum = REG_NA; + op1->ResetReuseRegVal(); + containedOp1 = true; + } + + if (containedOp1) + { + // Currently, we assume that the contained source of a GT_STORE_LCL_VAR writing to a register + // must be a constant. 
However, in the future we might want to support a contained memory op. + // This is a bit tricky because we have to decide it's contained before register allocation, + // and this would be a case where, once that's done, we need to mark that node as always + // requiring a register - which we always assume now anyway, but once we "optimize" that + // we'll have to take cases like this into account. + assert((op1->gtRegNum == REG_NA) && op1->OperIsConst()); + genSetRegToConst(treeNode->gtRegNum, targetType, op1); + } + else if (op1->gtRegNum != treeNode->gtRegNum) + { + assert(op1->gtRegNum != REG_NA); + emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(treeNode), treeNode, op1); + } + } + } + + if (treeNode->gtRegNum != REG_NA) + { + genProduceReg(treeNode); + } + } + break; + + case GT_RETFILT: + // A void GT_RETFILT is the end of a finally. For non-void filter returns we need to load the result in + // the return register, if it's not already there. The processing is the same as GT_RETURN. + if (targetType != TYP_VOID) + { + // For filters, the IL spec says the result is type int32. Further, the only specified legal values + // are 0 or 1, with the use of other values "undefined". + assert(targetType == TYP_INT); + } + + __fallthrough; + + case GT_RETURN: + genReturn(treeNode); + break; + + case GT_LEA: + { + // if we are here, it is the case where there is an LEA that cannot + // be folded into a parent instruction + GenTreeAddrMode* lea = treeNode->AsAddrMode(); + genLeaInstruction(lea); + } + // genLeaInstruction calls genProduceReg() + break; + + case GT_IND: +#ifdef FEATURE_SIMD + // Handling of Vector3 type values loaded through indirection. + if (treeNode->TypeGet() == TYP_SIMD12) + { + genLoadIndTypeSIMD12(treeNode); + break; + } +#endif // FEATURE_SIMD + + genConsumeAddress(treeNode->AsIndir()->Addr()); + emit->emitInsMov(ins_Load(treeNode->TypeGet()), emitTypeSize(treeNode), treeNode); + genProduceReg(treeNode); + break; + + case GT_MULHI: + genCodeForMulHi(treeNode->AsOp()); + genProduceReg(treeNode); + break; + + case GT_MUL: + { + instruction ins; + emitAttr size = emitTypeSize(treeNode); + bool isUnsignedMultiply = ((treeNode->gtFlags & GTF_UNSIGNED) != 0); + bool requiresOverflowCheck = treeNode->gtOverflowEx(); + + GenTree* op1 = treeNode->gtGetOp1(); + GenTree* op2 = treeNode->gtGetOp2(); + + // there are 3 forms of x64 multiply: + // 1-op form with 128 result: RDX:RAX = RAX * rm + // 2-op form: reg *= rm + // 3-op form: reg = rm * imm + + genConsumeOperands(treeNode->AsOp()); + + // This matches the 'mul' lowering in Lowering::SetMulOpCounts() + // + // immOp :: Only one operand can be an immediate + // rmOp :: Only one operand can be a memory op. + // regOp :: A register op (especially the operand that matches 'targetReg') + // (can be nullptr when we have both a memory op and an immediate op) + + GenTree* immOp = nullptr; + GenTree* rmOp = op1; + GenTree* regOp; + + if (op2->isContainedIntOrIImmed()) + { + immOp = op2; + } + else if (op1->isContainedIntOrIImmed()) + { + immOp = op1; + rmOp = op2; + } + + if (immOp != nullptr) + { + // This must be a non-floating point operation. 
+ assert(!varTypeIsFloating(treeNode)); + + // CQ: When possible use LEA for mul by imm 3, 5 or 9 + ssize_t imm = immOp->AsIntConCommon()->IconValue(); + + if (!requiresOverflowCheck && !rmOp->isContained() && ((imm == 3) || (imm == 5) || (imm == 9))) + { + // We will use the LEA instruction to perform this multiply + // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9. + unsigned int scale = (unsigned int)(imm - 1); + getEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->gtRegNum, rmOp->gtRegNum, scale, 0); + } + else + { + // use the 3-op form with immediate + ins = getEmitter()->inst3opImulForReg(targetReg); + emit->emitInsBinary(ins, size, rmOp, immOp); + } + } + else // we have no contained immediate operand + { + regOp = op1; + rmOp = op2; + + regNumber mulTargetReg = targetReg; + if (isUnsignedMultiply && requiresOverflowCheck) + { + ins = INS_mulEAX; + mulTargetReg = REG_RAX; + } + else + { + ins = genGetInsForOper(GT_MUL, targetType); + } + + // Set rmOp to the contain memory operand (if any) + // or set regOp to the op2 when it has the matching target register for our multiply op + // + if (op1->isContained() || (!op2->isContained() && (op2->gtRegNum == mulTargetReg))) + { + regOp = op2; + rmOp = op1; + } + assert(!regOp->isContained()); + + // Setup targetReg when neither of the source operands was a matching register + if (regOp->gtRegNum != mulTargetReg) + { + inst_RV_RV(ins_Copy(targetType), mulTargetReg, regOp->gtRegNum, targetType); + } + + emit->emitInsBinary(ins, size, treeNode, rmOp); + + // Move the result to the desired register, if necessary + if ((ins == INS_mulEAX) && (targetReg != REG_RAX)) + { + inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType); + } + } + + if (requiresOverflowCheck) + { + // Overflow checking is only used for non-floating point types + noway_assert(!varTypeIsFloating(treeNode)); + + genCheckOverflow(treeNode); + } + } + genProduceReg(treeNode); + break; + + case GT_MOD: + case GT_UDIV: + case GT_UMOD: + // We shouldn't be seeing GT_MOD on float/double args as it should get morphed into a + // helper call by front-end. Similarly we shouldn't be seeing GT_UDIV and GT_UMOD + // on float/double args. + noway_assert(!varTypeIsFloating(treeNode)); + __fallthrough; + + case GT_DIV: + genCodeForDivMod(treeNode->AsOp()); + break; + + case GT_INTRINSIC: + genIntrinsic(treeNode); + break; + +#ifdef FEATURE_SIMD + case GT_SIMD: + genSIMDIntrinsic(treeNode->AsSIMD()); + break; +#endif // FEATURE_SIMD + + case GT_CKFINITE: + genCkfinite(treeNode); + break; + + case GT_EQ: + case GT_NE: + case GT_LT: + case GT_LE: + case GT_GE: + case GT_GT: + { + // TODO-XArch-CQ: Check if we can use the currently set flags. + // TODO-XArch-CQ: Check for the case where we can simply transfer the carry bit to a register + // (signed < or >= where targetReg != REG_NA) + + GenTreePtr op1 = treeNode->gtGetOp1(); + var_types op1Type = op1->TypeGet(); + + if (varTypeIsFloating(op1Type)) + { + genCompareFloat(treeNode); + } +#if !defined(_TARGET_64BIT_) + // X86 Long comparison + else if (varTypeIsLong(op1Type)) + { + // When not materializing the result in a register, the compare logic is generated + // when we generate the GT_JTRUE. + if (treeNode->gtRegNum != REG_NA) + { + genCompareLong(treeNode); + } + else + { + // We generate the compare when we generate the GT_JTRUE, but we need to consume + // the operands now. 
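+                    // For illustration, the 64-bit signed "a < b" that genCompareLong/genJTrueLong
+                    // ultimately realize can be reduced to 32-bit halves roughly like the sketch
+                    // below (hypothetical helper shown only to convey the idea, not the exact
+                    // jump sequence that gets emitted):
+                    //
+                    //   bool Less64(int32_t aHi, uint32_t aLo, int32_t bHi, uint32_t bLo)
+                    //   {
+                    //       if (aHi != bHi)
+                    //           return aHi < bHi; // signed compare on the high halves
+                    //       return aLo < bLo;     // unsigned compare on the low halves
+                    //   }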
+ genConsumeOperands(treeNode->AsOp()); + } + } +#endif // !defined(_TARGET_64BIT_) + else + { + genCompareInt(treeNode); + } + } + break; + + case GT_JTRUE: + { + GenTree* cmp = treeNode->gtOp.gtOp1; + + assert(cmp->OperIsCompare()); + assert(compiler->compCurBB->bbJumpKind == BBJ_COND); + +#if !defined(_TARGET_64BIT_) + // For long compares, we emit special logic + if (varTypeIsLong(cmp->gtGetOp1())) + { + genJTrueLong(cmp); + } + else +#endif + { + // Get the "kind" and type of the comparison. Note that whether it is an unsigned cmp + // is governed by a flag NOT by the inherent type of the node + // TODO-XArch-CQ: Check if we can use the currently set flags. + emitJumpKind jumpKind[2]; + bool branchToTrueLabel[2]; + genJumpKindsForTree(cmp, jumpKind, branchToTrueLabel); + + BasicBlock* skipLabel = nullptr; + if (jumpKind[0] != EJ_NONE) + { + BasicBlock* jmpTarget; + if (branchToTrueLabel[0]) + { + jmpTarget = compiler->compCurBB->bbJumpDest; + } + else + { + // This case arises only for ordered GT_EQ right now + assert((cmp->gtOper == GT_EQ) && ((cmp->gtFlags & GTF_RELOP_NAN_UN) == 0)); + skipLabel = genCreateTempLabel(); + jmpTarget = skipLabel; + } + + inst_JMP(jumpKind[0], jmpTarget); + } + + if (jumpKind[1] != EJ_NONE) + { + // the second conditional branch always has to be to the true label + assert(branchToTrueLabel[1]); + inst_JMP(jumpKind[1], compiler->compCurBB->bbJumpDest); + } + + if (skipLabel != nullptr) + { + genDefineTempLabel(skipLabel); + } + } + } + break; + + case GT_RETURNTRAP: + { + // this is nothing but a conditional call to CORINFO_HELP_STOP_FOR_GC + // based on the contents of 'data' + + GenTree* data = treeNode->gtOp.gtOp1; + genConsumeRegs(data); + GenTreeIntCon cns = intForm(TYP_INT, 0); + emit->emitInsBinary(INS_cmp, emitTypeSize(TYP_INT), data, &cns); + + BasicBlock* skipLabel = genCreateTempLabel(); + + emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED); + inst_JMP(jmpEqual, skipLabel); + + // emit the call to the EE-helper that stops for GC (or other reasons) + assert(treeNode->gtRsvdRegs != RBM_NONE); + assert(genCountBits(treeNode->gtRsvdRegs) == 1); + regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs); + assert(genIsValidIntReg(tmpReg)); + + genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN, tmpReg); + genDefineTempLabel(skipLabel); + } + break; + + case GT_STOREIND: + genStoreInd(treeNode); + break; + + case GT_COPY: + // This is handled at the time we call genConsumeReg() on the GT_COPY + break; + + case GT_SWAP: + { + // Swap is only supported for lclVar operands that are enregistered + // We do not consume or produce any registers. Both operands remain enregistered. + // However, the gc-ness may change. 
+ assert(genIsRegCandidateLocal(treeNode->gtOp.gtOp1) && genIsRegCandidateLocal(treeNode->gtOp.gtOp2)); + + GenTreeLclVarCommon* lcl1 = treeNode->gtOp.gtOp1->AsLclVarCommon(); + LclVarDsc* varDsc1 = &(compiler->lvaTable[lcl1->gtLclNum]); + var_types type1 = varDsc1->TypeGet(); + GenTreeLclVarCommon* lcl2 = treeNode->gtOp.gtOp2->AsLclVarCommon(); + LclVarDsc* varDsc2 = &(compiler->lvaTable[lcl2->gtLclNum]); + var_types type2 = varDsc2->TypeGet(); + + // We must have both int or both fp regs + assert(!varTypeIsFloating(type1) || varTypeIsFloating(type2)); + + // FP swap is not yet implemented (and should have NYI'd in LSRA) + assert(!varTypeIsFloating(type1)); + + regNumber oldOp1Reg = lcl1->gtRegNum; + regMaskTP oldOp1RegMask = genRegMask(oldOp1Reg); + regNumber oldOp2Reg = lcl2->gtRegNum; + regMaskTP oldOp2RegMask = genRegMask(oldOp2Reg); + + // We don't call genUpdateVarReg because we don't have a tree node with the new register. + varDsc1->lvRegNum = oldOp2Reg; + varDsc2->lvRegNum = oldOp1Reg; + + // Do the xchg + emitAttr size = EA_PTRSIZE; + if (varTypeGCtype(type1) != varTypeGCtype(type2)) + { + // If the type specified to the emitter is a GC type, it will swap the GC-ness of the registers. + // Otherwise it will leave them alone, which is correct if they have the same GC-ness. + size = EA_GCREF; + } + inst_RV_RV(INS_xchg, oldOp1Reg, oldOp2Reg, TYP_I_IMPL, size); + + // Update the gcInfo. + // Manually remove these regs for the gc sets (mostly to avoid confusing duplicative dump output) + gcInfo.gcRegByrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask); + gcInfo.gcRegGCrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask); + + // gcMarkRegPtrVal will do the appropriate thing for non-gc types. + // It will also dump the updates. + gcInfo.gcMarkRegPtrVal(oldOp2Reg, type1); + gcInfo.gcMarkRegPtrVal(oldOp1Reg, type2); + } + break; + + case GT_LIST: + case GT_ARGPLACE: + // Nothing to do + break; + + case GT_PUTARG_STK: + genPutArgStk(treeNode); + break; + + case GT_PUTARG_REG: + { +#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING + noway_assert(targetType != TYP_STRUCT); +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + // commas show up here commonly, as part of a nullchk operation + GenTree* op1 = treeNode->gtOp.gtOp1; + // If child node is not already in the register we need, move it + genConsumeReg(op1); + if (treeNode->gtRegNum != op1->gtRegNum) + { + inst_RV_RV(ins_Copy(targetType), treeNode->gtRegNum, op1->gtRegNum, targetType); + } + genProduceReg(treeNode); + } + break; + + case GT_CALL: + genCallInstruction(treeNode); + break; + + case GT_JMP: + genJmpMethod(treeNode); + break; + + case GT_LOCKADD: + case GT_XCHG: + case GT_XADD: + genLockedInstructions(treeNode); + break; + + case GT_MEMORYBARRIER: + instGen_MemoryBarrier(); + break; + + case GT_CMPXCHG: + { + GenTreePtr location = treeNode->gtCmpXchg.gtOpLocation; // arg1 + GenTreePtr value = treeNode->gtCmpXchg.gtOpValue; // arg2 + GenTreePtr comparand = treeNode->gtCmpXchg.gtOpComparand; // arg3 + + assert(location->gtRegNum != REG_NA && location->gtRegNum != REG_RAX); + assert(value->gtRegNum != REG_NA && value->gtRegNum != REG_RAX); + + genConsumeReg(location); + genConsumeReg(value); + genConsumeReg(comparand); + // comparand goes to RAX; + // Note that we must issue this move after the genConsumeRegs(), in case any of the above + // have a GT_COPY from RAX. 
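+            // For illustration, the "lock cmpxchg [location], value" emitted below behaves
+            // roughly like this sketch, with RAX holding the comparand on entry and the old
+            // memory value on exit (which is why the result is read back from RAX):
+            //
+            //   T old = *location;        // the read and the conditional write are one atomic step
+            //   if (old == comparand)
+            //   {
+            //       *location = value;
+            //   }
+            //   RAX = old;                // copied to targetReg below if necessary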
+ if (comparand->gtRegNum != REG_RAX) + { + inst_RV_RV(ins_Copy(comparand->TypeGet()), REG_RAX, comparand->gtRegNum, comparand->TypeGet()); + } + + // location is Rm + instGen(INS_lock); + + emit->emitIns_AR_R(INS_cmpxchg, emitTypeSize(targetType), value->gtRegNum, location->gtRegNum, 0); + + // Result is in RAX + if (targetReg != REG_RAX) + { + inst_RV_RV(ins_Copy(targetType), targetReg, REG_RAX, targetType); + } + } + genProduceReg(treeNode); + break; + + case GT_RELOAD: + // do nothing - reload is just a marker. + // The parent node will call genConsumeReg on this which will trigger the unspill of this node's child + // into the register specified in this node. + break; + + case GT_NOP: + break; + + case GT_NO_OP: + if (treeNode->gtFlags & GTF_NO_OP_NO) + { + noway_assert(!"GTF_NO_OP_NO should not be set"); + } + else + { + getEmitter()->emitIns_Nop(1); + } + break; + + case GT_ARR_BOUNDS_CHECK: +#ifdef FEATURE_SIMD + case GT_SIMD_CHK: +#endif // FEATURE_SIMD + genRangeCheck(treeNode); + break; + + case GT_PHYSREG: + if (treeNode->gtRegNum != treeNode->AsPhysReg()->gtSrcReg) + { + inst_RV_RV(INS_mov, treeNode->gtRegNum, treeNode->AsPhysReg()->gtSrcReg, targetType); + + genTransferRegGCState(treeNode->gtRegNum, treeNode->AsPhysReg()->gtSrcReg); + } + genProduceReg(treeNode); + break; + + case GT_PHYSREGDST: + break; + + case GT_NULLCHECK: + { + assert(!treeNode->gtOp.gtOp1->isContained()); + regNumber reg = genConsumeReg(treeNode->gtOp.gtOp1); + emit->emitIns_AR_R(INS_cmp, EA_4BYTE, reg, reg, 0); + } + break; + + case GT_CATCH_ARG: + + noway_assert(handlerGetsXcptnObj(compiler->compCurBB->bbCatchTyp)); + + /* Catch arguments get passed in a register. genCodeForBBlist() + would have marked it as holding a GC object, but not used. */ + + noway_assert(gcInfo.gcRegGCrefSetCur & RBM_EXCEPTION_OBJECT); + genConsumeReg(treeNode); + break; + +#if !FEATURE_EH_FUNCLETS + case GT_END_LFIN: + + // Have to clear the ShadowSP of the nesting level which encloses the finally. Generates: + // mov dword ptr [ebp-0xC], 0 // for some slot of the ShadowSP local var + + unsigned finallyNesting; + finallyNesting = treeNode->gtVal.gtVal1; + noway_assert(treeNode->gtVal.gtVal1 < compiler->compHndBBtabCount); + noway_assert(finallyNesting < compiler->compHndBBtabCount); + + // The last slot is reserved for ICodeManager::FixContext(ppEndRegion) + unsigned filterEndOffsetSlotOffs; + PREFIX_ASSUME(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) > + TARGET_POINTER_SIZE); // below doesn't underflow. 
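+            // As a worked example of the computation below: assuming TARGET_POINTER_SIZE is 4
+            // and the ShadowSP local occupies 12 bytes (two nesting slots plus the reserved
+            // FixContext slot), filterEndOffsetSlotOffs = 12 - 4 = 8 and, for
+            // finallyNesting == 0, curNestingSlotOffs = 8 - (0 + 1) * 4 = 4.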
+ filterEndOffsetSlotOffs = + (unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE); + + unsigned curNestingSlotOffs; + curNestingSlotOffs = filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE); + instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, 0, compiler->lvaShadowSPslotsVar, curNestingSlotOffs); + break; +#endif // !FEATURE_EH_FUNCLETS + + case GT_PINVOKE_PROLOG: + noway_assert(((gcInfo.gcRegGCrefSetCur | gcInfo.gcRegByrefSetCur) & ~fullIntArgRegMask()) == 0); + + // the runtime side requires the codegen here to be consistent + emit->emitDisableRandomNops(); + break; + + case GT_LABEL: + genPendingCallLabel = genCreateTempLabel(); + treeNode->gtLabel.gtLabBB = genPendingCallLabel; + emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, genPendingCallLabel, treeNode->gtRegNum); + break; + + case GT_STORE_OBJ: + if (treeNode->OperIsCopyBlkOp() && !treeNode->AsBlk()->gtBlkOpGcUnsafe) + { + assert(treeNode->AsObj()->gtGcPtrCount != 0); + genCodeForCpObj(treeNode->AsObj()); + break; + } + __fallthrough; + + case GT_STORE_DYN_BLK: + case GT_STORE_BLK: + genCodeForStoreBlk(treeNode->AsBlk()); + break; + + case GT_JMPTABLE: + genJumpTable(treeNode); + break; + + case GT_SWITCH_TABLE: + genTableBasedSwitch(treeNode); + break; + + case GT_ARR_INDEX: + genCodeForArrIndex(treeNode->AsArrIndex()); + break; + + case GT_ARR_OFFSET: + genCodeForArrOffset(treeNode->AsArrOffs()); + break; + + case GT_CLS_VAR_ADDR: + getEmitter()->emitIns_R_C(INS_lea, EA_PTRSIZE, targetReg, treeNode->gtClsVar.gtClsVarHnd, 0); + genProduceReg(treeNode); + break; + +#if !defined(_TARGET_64BIT_) + case GT_LONG: + assert(!treeNode->isContained()); + genConsumeRegs(treeNode); + break; +#endif + + case GT_IL_OFFSET: + // Do nothing; these nodes are simply markers for debug info. + break; + + default: + { +#ifdef DEBUG + char message[256]; + sprintf(message, "Unimplemented node type %s\n", GenTree::NodeName(treeNode->OperGet())); +#endif + assert(!"Unknown node in codegen"); + } + break; + } +} + +//---------------------------------------------------------------------------------- +// genMultiRegCallStoreToLocal: store multi-reg return value of a call node to a local +// +// Arguments: +// treeNode - Gentree of GT_STORE_LCL_VAR +// +// Return Value: +// None +// +// Assumption: +// The child of store is a multi-reg call node. +// genProduceReg() on treeNode is made by caller of this routine. +// +void CodeGen::genMultiRegCallStoreToLocal(GenTreePtr treeNode) +{ + assert(treeNode->OperGet() == GT_STORE_LCL_VAR); + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // Structs of size >=9 and <=16 are returned in two return registers on x64 Unix. + assert(varTypeIsStruct(treeNode)); + + // Assumption: current x64 Unix implementation requires that a multi-reg struct + // var in 'var = call' is flagged as lvIsMultiRegRet to prevent it from + // being struct promoted. 
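+    // For illustration, on the SysV x64 ABI the pair of return registers depends on how the
+    // struct's two eightbytes are classified (a simplified sketch; the real classification is
+    // performed by the ABI classifier):
+    //
+    //   struct { int64_t a; int64_t b; };  // INTEGER + INTEGER -> RAX:RDX
+    //   struct { double  x; double  y; };  // SSE     + SSE     -> XMM0:XMM1
+    //   struct { int64_t a; double  y; };  // INTEGER + SSE     -> RAX, XMM0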
+ unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum; + LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]); + noway_assert(varDsc->lvIsMultiRegRet); + + GenTree* op1 = treeNode->gtGetOp1(); + GenTree* actualOp1 = op1->gtSkipReloadOrCopy(); + GenTreeCall* call = actualOp1->AsCall(); + assert(call->HasMultiRegRetVal()); + + genConsumeRegs(op1); + + ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); + assert(retTypeDesc->GetReturnRegCount() == MAX_RET_REG_COUNT); + unsigned regCount = retTypeDesc->GetReturnRegCount(); + + if (treeNode->gtRegNum != REG_NA) + { + // Right now the only enregistrable structs supported are SIMD types. + assert(varTypeIsSIMD(treeNode)); + assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(0))); + assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(1))); + + // This is a case of two 8-bytes that comprise the operand is in + // two different xmm registers and needs to assembled into a single + // xmm register. + regNumber targetReg = treeNode->gtRegNum; + regNumber reg0 = call->GetRegNumByIdx(0); + regNumber reg1 = call->GetRegNumByIdx(1); + + if (op1->IsCopyOrReload()) + { + // GT_COPY/GT_RELOAD will have valid reg for those positions + // that need to be copied or reloaded. + regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0); + if (reloadReg != REG_NA) + { + reg0 = reloadReg; + } + + reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1); + if (reloadReg != REG_NA) + { + reg1 = reloadReg; + } + } + + if (targetReg != reg0 && targetReg != reg1) + { + // Copy reg0 into targetReg and let it to be handled by one + // of the cases below. + inst_RV_RV(ins_Copy(TYP_DOUBLE), targetReg, reg0, TYP_DOUBLE); + targetReg = reg0; + } + + if (targetReg == reg0) + { + // targeReg[63:0] = targetReg[63:0] + // targetReg[127:64] = reg1[127:64] + inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg1, 0x00); + } + else + { + assert(targetReg == reg1); + + // We need two shuffles to achieve this + // First: + // targeReg[63:0] = targetReg[63:0] + // targetReg[127:64] = reg0[63:0] + // + // Second: + // targeReg[63:0] = targetReg[127:64] + // targetReg[127:64] = targetReg[63:0] + // + // Essentially copy low 8-bytes from reg0 to high 8-bytes of targetReg + // and next swap low and high 8-bytes of targetReg to have them + // rearranged in the right order. + inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg0, 0x00); + inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, targetReg, 0x01); + } + } + else + { + // Stack store + int offset = 0; + for (unsigned i = 0; i < regCount; ++i) + { + var_types type = retTypeDesc->GetReturnRegType(i); + regNumber reg = call->GetRegNumByIdx(i); + if (op1->IsCopyOrReload()) + { + // GT_COPY/GT_RELOAD will have valid reg for those positions + // that need to be copied or reloaded. + regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(i); + if (reloadReg != REG_NA) + { + reg = reloadReg; + } + } + + assert(reg != REG_NA); + getEmitter()->emitIns_S_R(ins_Store(type), emitTypeSize(type), reg, lclNum, offset); + offset += genTypeSize(type); + } + + varDsc->lvRegNum = REG_STK; + } +#elif defined(_TARGET_X86_) + // Longs are returned in two return registers on x86. + assert(varTypeIsLong(treeNode)); + + // Assumption: current x86 implementation requires that a multi-reg long + // var in 'var = call' is flagged as lvIsMultiRegRet to prevent it from + // being promoted. 
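+    // For illustration, the SHUFPD packing used in the unix-amd64 path above can be written
+    // with intrinsics roughly as follows, assuming each return register carries its 8-byte
+    // piece in the low 64 bits of the xmm register (hypothetical helper, idea only):
+    //
+    //   #include <emmintrin.h>
+    //   __m128d Pack(__m128d reg0, __m128d reg1)
+    //   {
+    //       return _mm_shuffle_pd(reg0, reg1, 0x00); // result = [reg0.lo, reg1.lo]
+    //   }
+    //
+    //   // When the target register is reg1 instead, two shuffles are needed:
+    //   //   t = _mm_shuffle_pd(reg1, reg0, 0x00);   // t = [reg1.lo, reg0.lo]
+    //   //   t = _mm_shuffle_pd(t, t, 0x01);         // t = [reg0.lo, reg1.lo]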
+ unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum; + LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]); + noway_assert(varDsc->lvIsMultiRegRet); + + GenTree* op1 = treeNode->gtGetOp1(); + GenTree* actualOp1 = op1->gtSkipReloadOrCopy(); + GenTreeCall* call = actualOp1->AsCall(); + assert(call->HasMultiRegRetVal()); + + genConsumeRegs(op1); + + ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); + unsigned regCount = retTypeDesc->GetReturnRegCount(); + assert(regCount == MAX_RET_REG_COUNT); + + // Stack store + int offset = 0; + for (unsigned i = 0; i < regCount; ++i) + { + var_types type = retTypeDesc->GetReturnRegType(i); + regNumber reg = call->GetRegNumByIdx(i); + if (op1->IsCopyOrReload()) + { + // GT_COPY/GT_RELOAD will have valid reg for those positions + // that need to be copied or reloaded. + regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(i); + if (reloadReg != REG_NA) + { + reg = reloadReg; + } + } + + assert(reg != REG_NA); + getEmitter()->emitIns_S_R(ins_Store(type), emitTypeSize(type), reg, lclNum, offset); + offset += genTypeSize(type); + } + + varDsc->lvRegNum = REG_STK; +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING && !_TARGET_X86_ + assert(!"Unreached"); +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING && !_TARGET_X86_ +} + +//------------------------------------------------------------------------ +// genLclHeap: Generate code for localloc. +// +// Arguments: +// tree - the localloc tree to generate. +// +// Notes: +// Note that for x86, we don't track ESP movements while generating the localloc code. +// The ESP tracking is used to report stack pointer-relative GC info, which is not +// interesting while doing the localloc construction. Also, for functions with localloc, +// we have EBP frames, and EBP-relative locals, and ESP-relative accesses only for function +// call arguments. We store the ESP after the localloc is complete in the LocAllocSP +// variable. This variable is implicitly reported to the VM in the GC info (its position +// is defined by convention relative to other items), and is used by the GC to find the +// "base" stack pointer in functions with localloc. 
+// +void CodeGen::genLclHeap(GenTreePtr tree) +{ + assert(tree->OperGet() == GT_LCLHEAP); + assert(compiler->compLocallocUsed); + + GenTreePtr size = tree->gtOp.gtOp1; + noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL)); + + regNumber targetReg = tree->gtRegNum; + regMaskTP tmpRegsMask = tree->gtRsvdRegs; + regNumber regCnt = REG_NA; + var_types type = genActualType(size->gtType); + emitAttr easz = emitTypeSize(type); + BasicBlock* endLabel = nullptr; + +#ifdef DEBUG + // Verify ESP + if (compiler->opts.compStackCheckOnRet) + { + noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC && + compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister && + compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame); + getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnEspCheck, 0); + + BasicBlock* esp_check = genCreateTempLabel(); + emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED); + inst_JMP(jmpEqual, esp_check); + getEmitter()->emitIns(INS_BREAKPOINT); + genDefineTempLabel(esp_check); + } +#endif + + noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes + noway_assert(genStackLevel == 0); // Can't have anything on the stack + + unsigned stackAdjustment = 0; + BasicBlock* loop = nullptr; + + // compute the amount of memory to allocate to properly STACK_ALIGN. + size_t amount = 0; + if (size->IsCnsIntOrI()) + { + // If size is a constant, then it must be contained. + assert(size->isContained()); + + // If amount is zero then return null in targetReg + amount = size->gtIntCon.gtIconVal; + if (amount == 0) + { + instGen_Set_Reg_To_Zero(EA_PTRSIZE, targetReg); + goto BAILOUT; + } + + // 'amount' is the total number of bytes to localloc to properly STACK_ALIGN + amount = AlignUp(amount, STACK_ALIGN); + } + else + { + // The localloc requested memory size is non-constant. + + // Put the size value in targetReg. If it is zero, bail out by returning null in targetReg. + genConsumeRegAndCopy(size, targetReg); + endLabel = genCreateTempLabel(); + getEmitter()->emitIns_R_R(INS_test, easz, targetReg, targetReg); + inst_JMP(EJ_je, endLabel); + + // Compute the size of the block to allocate and perform alignment. + // If compInitMem=true, we can reuse targetReg as regcnt, + // since we don't need any internal registers. + if (compiler->info.compInitMem) + { + assert(genCountBits(tmpRegsMask) == 0); + regCnt = targetReg; + } + else + { + assert(genCountBits(tmpRegsMask) >= 1); + regMaskTP regCntMask = genFindLowestBit(tmpRegsMask); + tmpRegsMask &= ~regCntMask; + regCnt = genRegNumFromMask(regCntMask); + if (regCnt != targetReg) + { + // Above, we put the size in targetReg. Now, copy it to our new temp register if necessary. + inst_RV_RV(INS_mov, regCnt, targetReg, size->TypeGet()); + } + } + + // Round up the number of bytes to allocate to a STACK_ALIGN boundary. This is done + // by code like: + // add reg, 15 + // and reg, -16 + // However, in the initialized memory case, we need the count of STACK_ALIGN-sized + // elements, not a byte count, after the alignment. So instead of the "and", which + // becomes unnecessary, generate a shift, e.g.: + // add reg, 15 + // shr reg, 4 + + inst_RV_IV(INS_add, regCnt, STACK_ALIGN - 1, emitActualTypeSize(type)); + + if (compiler->info.compInitMem) + { + // Convert the count from a count of bytes to a loop count. We will loop once per + // stack alignment size, so each loop will zero 4 bytes on x86 and 16 bytes on x64. 
+ // Note that we zero a single reg-size word per iteration on x86, and 2 reg-size + // words per iteration on x64. We will shift off all the stack alignment bits + // added above, so there is no need for an 'and' instruction. + + // --- shr regCnt, 2 (or 4) --- + inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT_ALL); + } + else + { + // Otherwise, mask off the low bits to align the byte count. + inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type)); + } + } + +#if FEATURE_FIXED_OUT_ARGS + // If we have an outgoing arg area then we must adjust the SP by popping off the + // outgoing arg area. We will restore it right before we return from this method. + // + // Localloc returns stack space that aligned to STACK_ALIGN bytes. The following + // are the cases that need to be handled: + // i) Method has out-going arg area. + // It is guaranteed that size of out-going arg area is STACK_ALIGN'ed (see fgMorphArgs). + // Therefore, we will pop off the out-going arg area from RSP before allocating the localloc space. + // ii) Method has no out-going arg area. + // Nothing to pop off from the stack. + if (compiler->lvaOutgoingArgSpaceSize > 0) + { + assert((compiler->lvaOutgoingArgSpaceSize % STACK_ALIGN) == 0); // This must be true for the stack to remain + // aligned + inst_RV_IV(INS_add, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize, EA_PTRSIZE); + stackAdjustment += compiler->lvaOutgoingArgSpaceSize; + } +#endif + + if (size->IsCnsIntOrI()) + { + // We should reach here only for non-zero, constant size allocations. + assert(amount > 0); + assert((amount % STACK_ALIGN) == 0); + assert((amount % REGSIZE_BYTES) == 0); + + // For small allocations we will generate up to six push 0 inline + size_t cntRegSizedWords = amount / REGSIZE_BYTES; + if (cntRegSizedWords <= 6) + { + for (; cntRegSizedWords != 0; cntRegSizedWords--) + { + inst_IV(INS_push_hide, 0); // push_hide means don't track the stack + } + goto ALLOC_DONE; + } + + bool doNoInitLessThanOnePageAlloc = + !compiler->info.compInitMem && (amount < compiler->eeGetPageSize()); // must be < not <= + +#ifdef _TARGET_X86_ + bool needRegCntRegister = true; +#else // !_TARGET_X86_ + bool needRegCntRegister = !doNoInitLessThanOnePageAlloc; +#endif // !_TARGET_X86_ + + if (needRegCntRegister) + { + // If compInitMem=true, we can reuse targetReg as regcnt. + // Since size is a constant, regCnt is not yet initialized. + assert(regCnt == REG_NA); + if (compiler->info.compInitMem) + { + assert(genCountBits(tmpRegsMask) == 0); + regCnt = targetReg; + } + else + { + assert(genCountBits(tmpRegsMask) >= 1); + regMaskTP regCntMask = genFindLowestBit(tmpRegsMask); + tmpRegsMask &= ~regCntMask; + regCnt = genRegNumFromMask(regCntMask); + } + } + + if (doNoInitLessThanOnePageAlloc) + { + // Since the size is less than a page, simply adjust ESP. + // ESP might already be in the guard page, so we must touch it BEFORE + // the alloc, not after. + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef _TARGET_X86_ + // For x86, we don't want to use "sub ESP" because we don't want the emitter to track the adjustment + // to ESP. So do the work in the count register. + // TODO-CQ: manipulate ESP directly, to share code, reduce #ifdefs, and improve CQ. This would require + // creating a way to temporarily turn off the emitter's tracking of ESP, maybe marking instrDescs as "don't + // track". 
+ inst_RV_RV(INS_mov, regCnt, REG_SPBASE, TYP_I_IMPL); + getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0); + inst_RV_IV(INS_sub, regCnt, amount, EA_PTRSIZE); + inst_RV_RV(INS_mov, REG_SPBASE, regCnt, TYP_I_IMPL); +#else // !_TARGET_X86_ + getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0); + inst_RV_IV(INS_sub, REG_SPBASE, amount, EA_PTRSIZE); +#endif // !_TARGET_X86_ + + goto ALLOC_DONE; + } + + // else, "mov regCnt, amount" + + if (compiler->info.compInitMem) + { + // When initializing memory, we want 'amount' to be the loop count. + assert((amount % STACK_ALIGN) == 0); + amount /= STACK_ALIGN; + } + + genSetRegToIcon(regCnt, amount, ((int)amount == amount) ? TYP_INT : TYP_LONG); + } + + loop = genCreateTempLabel(); + if (compiler->info.compInitMem) + { + // At this point 'regCnt' is set to the number of loop iterations for this loop, if each + // iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes. + // Since we have to zero out the allocated memory AND ensure that RSP is always valid + // by tickling the pages, we will just push 0's on the stack. + + assert(genIsValidIntReg(regCnt)); + + // Loop: + genDefineTempLabel(loop); + +#if defined(_TARGET_AMD64_) + // Push two 8-byte zeros. This matches the 16-byte STACK_ALIGN value. + static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2)); + inst_IV(INS_push_hide, 0); // --- push 8-byte 0 + inst_IV(INS_push_hide, 0); // --- push 8-byte 0 +#elif defined(_TARGET_X86_) + // Push a single 4-byte zero. This matches the 4-byte STACK_ALIGN value. + static_assert_no_msg(STACK_ALIGN == REGSIZE_BYTES); + inst_IV(INS_push_hide, 0); // --- push 4-byte 0 +#endif // _TARGET_X86_ + + // Decrement the loop counter and loop if not done. + inst_RV(INS_dec, regCnt, TYP_I_IMPL); + inst_JMP(EJ_jne, loop); + } + else + { + // At this point 'regCnt' is set to the total number of bytes to localloc. + // + // We don't need to zero out the allocated memory. However, we do have + // to tickle the pages to ensure that ESP is always valid and is + // in sync with the "stack guard page". Note that in the worst + // case ESP is on the last byte of the guard page. Thus you must + // touch ESP+0 first not ESP+x01000. + // + // Another subtlety is that you don't want ESP to be exactly on the + // boundary of the guard page because PUSH is predecrement, thus + // call setup would not touch the guard page but just beyond it + // + // Note that we go through a few hoops so that ESP never points to + // illegal pages at any time during the tickling process + // + // neg REGCNT + // add REGCNT, ESP // reg now holds ultimate ESP + // jb loop // result is smaller than orignial ESP (no wrap around) + // xor REGCNT, REGCNT, // Overflow, pick lowest possible number + // loop: + // test ESP, [ESP+0] // tickle the page + // mov REGTMP, ESP + // sub REGTMP, PAGE_SIZE + // mov ESP, REGTMP + // cmp ESP, REGCNT + // jae loop + // + // mov ESP, REG + // end: + inst_RV(INS_NEG, regCnt, TYP_I_IMPL); + inst_RV_RV(INS_add, regCnt, REG_SPBASE, TYP_I_IMPL); + inst_JMP(EJ_jb, loop); + + instGen_Set_Reg_To_Zero(EA_PTRSIZE, regCnt); + + genDefineTempLabel(loop); + + // Tickle the decremented value, and move back to ESP, + // note that it has to be done BEFORE the update of ESP since + // ESP might already be on the guard page. 
It is OK to leave + // the final value of ESP on the guard page + getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0); + + // This is a harmless trick to avoid the emitter trying to track the + // decrement of the ESP - we do the subtraction in another reg instead + // of adjusting ESP directly. + assert(tmpRegsMask != RBM_NONE); + assert(genCountBits(tmpRegsMask) == 1); + regNumber regTmp = genRegNumFromMask(tmpRegsMask); + + inst_RV_RV(INS_mov, regTmp, REG_SPBASE, TYP_I_IMPL); + inst_RV_IV(INS_sub, regTmp, compiler->eeGetPageSize(), EA_PTRSIZE); + inst_RV_RV(INS_mov, REG_SPBASE, regTmp, TYP_I_IMPL); + + inst_RV_RV(INS_cmp, REG_SPBASE, regCnt, TYP_I_IMPL); + inst_JMP(EJ_jae, loop); + + // Move the final value to ESP + inst_RV_RV(INS_mov, REG_SPBASE, regCnt); + } + +ALLOC_DONE: + // Re-adjust SP to allocate out-going arg area + if (stackAdjustment > 0) + { + assert((stackAdjustment % STACK_ALIGN) == 0); // This must be true for the stack to remain aligned + inst_RV_IV(INS_sub, REG_SPBASE, stackAdjustment, EA_PTRSIZE); + } + + // Return the stackalloc'ed address in result register. + // TargetReg = RSP + stackAdjustment. + getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, targetReg, REG_SPBASE, stackAdjustment); + + if (endLabel != nullptr) + { + genDefineTempLabel(endLabel); + } + +BAILOUT: + + // Write the lvaLocAllocSPvar stack frame slot + noway_assert(compiler->lvaLocAllocSPvar != BAD_VAR_NUM); + getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaLocAllocSPvar, 0); + +#if STACK_PROBES + if (compiler->opts.compNeedStackProbes) + { + genGenerateStackProbe(); + } +#endif + +#ifdef DEBUG + // Update new ESP + if (compiler->opts.compStackCheckOnRet) + { + noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC && + compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister && + compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame); + getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnEspCheck, 0); + } +#endif + + genProduceReg(tree); +} + +void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode) +{ + if (storeBlkNode->gtBlkOpGcUnsafe) + { + getEmitter()->emitDisableGC(); + } + bool isCopyBlk = storeBlkNode->OperIsCopyBlkOp(); + + switch (storeBlkNode->gtBlkOpKind) + { +#ifdef _TARGET_AMD64_ + case GenTreeBlk::BlkOpKindHelper: + if (isCopyBlk) + { + genCodeForCpBlk(storeBlkNode); + } + else + { + genCodeForInitBlk(storeBlkNode); + } + break; +#endif // _TARGET_AMD64_ + case GenTreeBlk::BlkOpKindRepInstr: + if (isCopyBlk) + { + genCodeForCpBlkRepMovs(storeBlkNode); + } + else + { + genCodeForInitBlkRepStos(storeBlkNode); + } + break; + case GenTreeBlk::BlkOpKindUnroll: + if (isCopyBlk) + { + genCodeForCpBlkUnroll(storeBlkNode); + } + else + { + genCodeForInitBlkUnroll(storeBlkNode); + } + break; + default: + unreached(); + } + if (storeBlkNode->gtBlkOpGcUnsafe) + { + getEmitter()->emitEnableGC(); + } +} + +// Generate code for InitBlk using rep stos. +// Preconditions: +// The size of the buffers must be a constant and also less than INITBLK_STOS_LIMIT bytes. +// Any value larger than that, we'll use the helper even if both the +// fill byte and the size are integer constants. 
+void CodeGen::genCodeForInitBlkRepStos(GenTreeBlk* initBlkNode) +{ + // Make sure we got the arguments of the initblk/initobj operation in the right registers + unsigned size = initBlkNode->Size(); + GenTreePtr dstAddr = initBlkNode->Addr(); + GenTreePtr initVal = initBlkNode->Data(); + +#ifdef DEBUG + assert(!dstAddr->isContained()); + assert(!initVal->isContained()); +#ifdef _TARGET_AMD64_ + assert(size != 0); +#endif + if (initVal->IsCnsIntOrI()) + { +#ifdef _TARGET_AMD64_ + assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT); +#else + assert(size > CPBLK_UNROLL_LIMIT); +#endif + } + +#endif // DEBUG + + genConsumeBlockOp(initBlkNode, REG_RDI, REG_RAX, REG_RCX); + instGen(INS_r_stosb); +} + +// Generate code for InitBlk by performing a loop unroll +// Preconditions: +// a) Both the size and fill byte value are integer constants. +// b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes. +// +void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* initBlkNode) +{ + // Make sure we got the arguments of the initblk/initobj operation in the right registers + unsigned size = initBlkNode->Size(); + GenTreePtr dstAddr = initBlkNode->Addr(); + GenTreePtr initVal = initBlkNode->Data(); + + assert(!dstAddr->isContained()); + assert(!initVal->isContained()); + assert(size != 0); + assert(size <= INITBLK_UNROLL_LIMIT); + assert(initVal->gtSkipReloadOrCopy()->IsCnsIntOrI()); + + emitter* emit = getEmitter(); + + genConsumeOperands(initBlkNode); + + // If the initVal was moved, or spilled and reloaded to a different register, + // get the original initVal from below the GT_RELOAD, but only after capturing the valReg, + // which needs to be the new register. + regNumber valReg = initVal->gtRegNum; + initVal = initVal->gtSkipReloadOrCopy(); + + unsigned offset = 0; + + // Perform an unroll using SSE2 loads and stores. + if (size >= XMM_REGSIZE_BYTES) + { + regNumber tmpReg = genRegNumFromMask(initBlkNode->gtRsvdRegs); + +#ifdef DEBUG + assert(initBlkNode->gtRsvdRegs != RBM_NONE); + assert(genCountBits(initBlkNode->gtRsvdRegs) == 1); + assert(genIsValidFloatReg(tmpReg)); +#endif // DEBUG + + if (initVal->gtIntCon.gtIconVal != 0) + { + emit->emitIns_R_R(INS_mov_i2xmm, EA_PTRSIZE, tmpReg, valReg); + emit->emitIns_R_R(INS_punpckldq, EA_8BYTE, tmpReg, tmpReg); +#ifdef _TARGET_X86_ + // For x86, we need one more to convert it from 8 bytes to 16 bytes. + emit->emitIns_R_R(INS_punpckldq, EA_8BYTE, tmpReg, tmpReg); +#endif // _TARGET_X86_ + } + else + { + emit->emitIns_R_R(INS_xorpd, EA_8BYTE, tmpReg, tmpReg); + } + + // Determine how many 16 byte slots we're going to fill using SSE movs. + size_t slots = size / XMM_REGSIZE_BYTES; + + while (slots-- > 0) + { + emit->emitIns_AR_R(INS_movdqu, EA_8BYTE, tmpReg, dstAddr->gtRegNum, offset); + offset += XMM_REGSIZE_BYTES; + } + } + + // Fill the remainder (or a < 16 byte sized struct) + if ((size & 8) != 0) + { +#ifdef _TARGET_X86_ + // TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs. 
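+        // For illustration, the 16-byte SSE fill loop above (mov_i2xmm + punpckldq + movdqu)
+        // is roughly equivalent to this intrinsics sketch, where fillDword stands for the fill
+        // byte already replicated into a 32-bit pattern (hypothetical names, idea only):
+        //
+        //   #include <emmintrin.h>
+        //   __m128i pattern = _mm_set1_epi32(fillDword);
+        //   for (unsigned i = 0; i + 16 <= size; i += 16)
+        //   {
+        //       _mm_storeu_si128((__m128i*)(dst + i), pattern); // matches the movdqu stores
+        //   }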
+ emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset); + offset += 4; + emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset); + offset += 4; +#else // !_TARGET_X86_ + emit->emitIns_AR_R(INS_mov, EA_8BYTE, valReg, dstAddr->gtRegNum, offset); + offset += 8; +#endif // !_TARGET_X86_ + } + if ((size & 4) != 0) + { + emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset); + offset += 4; + } + if ((size & 2) != 0) + { + emit->emitIns_AR_R(INS_mov, EA_2BYTE, valReg, dstAddr->gtRegNum, offset); + offset += 2; + } + if ((size & 1) != 0) + { + emit->emitIns_AR_R(INS_mov, EA_1BYTE, valReg, dstAddr->gtRegNum, offset); + } +} + +// Generates code for InitBlk by calling the VM memset helper function. +// Preconditions: +// a) The size argument of the InitBlk is not an integer constant. +// b) The size argument of the InitBlk is >= INITBLK_STOS_LIMIT bytes. +void CodeGen::genCodeForInitBlk(GenTreeBlk* initBlkNode) +{ +#ifdef _TARGET_AMD64_ + // Make sure we got the arguments of the initblk operation in the right registers + unsigned blockSize = initBlkNode->Size(); + GenTreePtr dstAddr = initBlkNode->Addr(); + GenTreePtr initVal = initBlkNode->Data(); + + assert(!dstAddr->isContained()); + assert(!initVal->isContained()); + + if (blockSize != 0) + { + assert(blockSize >= CPBLK_MOVS_LIMIT); + } + + genConsumeBlockOp(initBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2); + + genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN); +#else // !_TARGET_AMD64_ + NYI_X86("Helper call for InitBlk"); +#endif // !_TARGET_AMD64_ +} + +// Generate code for a load from some address + offset +// baseNode: tree node which can be either a local address or arbitrary node +// offset: distance from the baseNode from which to load +void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* baseNode, unsigned offset) +{ + emitter* emit = getEmitter(); + + if (baseNode->OperIsLocalAddr()) + { + if (baseNode->gtOper == GT_LCL_FLD_ADDR) + { + offset += baseNode->gtLclFld.gtLclOffs; + } + emit->emitIns_R_S(ins, size, dst, baseNode->gtLclVarCommon.gtLclNum, offset); + } + else + { + emit->emitIns_R_AR(ins, size, dst, baseNode->gtRegNum, offset); + } +} + +//------------------------------------------------------------------------ +// genCodeForStoreOffset: Generate code to store a reg to [base + offset]. +// +// Arguments: +// ins - the instruction to generate. +// size - the size that needs to be stored. +// src - the register which needs to be stored. +// baseNode - the base, relative to which to store the src register. +// offset - the offset that is added to the baseNode to calculate the address to store into. +// +void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber src, GenTree* baseNode, unsigned offset) +{ + emitter* emit = getEmitter(); + + if (baseNode->OperIsLocalAddr()) + { + if (baseNode->gtOper == GT_LCL_FLD_ADDR) + { + offset += baseNode->gtLclFld.gtLclOffs; + } + + emit->emitIns_S_R(ins, size, src, baseNode->AsLclVarCommon()->GetLclNum(), offset); + } + else + { + emit->emitIns_AR_R(ins, size, src, baseNode->gtRegNum, offset); + } +} + +// Generates CpBlk code by performing a loop unroll +// Preconditions: +// The size argument of the CpBlk node is a constant and <= 64 bytes. +// This may seem small but covers >95% of the cases in several framework assemblies. 
+// +void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode) +{ + // Make sure we got the arguments of the cpblk operation in the right registers + unsigned size = cpBlkNode->Size(); + GenTreePtr dstAddr = cpBlkNode->Addr(); + GenTreePtr source = cpBlkNode->Data(); + GenTreePtr srcAddr = nullptr; + assert(size <= CPBLK_UNROLL_LIMIT); + + emitter* emit = getEmitter(); + + if (source->gtOper == GT_IND) + { + srcAddr = source->gtGetOp1(); + if (!srcAddr->isContained()) + { + genConsumeReg(srcAddr); + } + } + else + { + noway_assert(source->IsLocal()); + // TODO-Cleanup: Consider making the addrForm() method in Rationalize public, e.g. in GenTree. + // OR: transform source to GT_IND(GT_LCL_VAR_ADDR) + if (source->OperGet() == GT_LCL_VAR) + { + source->SetOper(GT_LCL_VAR_ADDR); + } + else + { + assert(source->OperGet() == GT_LCL_FLD); + source->SetOper(GT_LCL_FLD_ADDR); + } + srcAddr = source; + } + + if (!dstAddr->isContained()) + { + genConsumeReg(dstAddr); + } + + unsigned offset = 0; + + // If the size of this struct is larger than 16 bytes + // let's use SSE2 to be able to do 16 byte at a time + // loads and stores. + + if (size >= XMM_REGSIZE_BYTES) + { + assert(cpBlkNode->gtRsvdRegs != RBM_NONE); + regNumber xmmReg = genRegNumFromMask(cpBlkNode->gtRsvdRegs & RBM_ALLFLOAT); + assert(genIsValidFloatReg(xmmReg)); + size_t slots = size / XMM_REGSIZE_BYTES; + + // TODO: In the below code the load and store instructions are for 16 bytes, but the + // type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but + // this probably needs to be changed. + while (slots-- > 0) + { + // Load + genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmReg, srcAddr, offset); + // Store + genCodeForStoreOffset(INS_movdqu, EA_8BYTE, xmmReg, dstAddr, offset); + offset += XMM_REGSIZE_BYTES; + } + } + + // Fill the remainder (15 bytes or less) if there's one. + if ((size & 0xf) != 0) + { + // Grab the integer temp register to emit the remaining loads and stores. + regNumber tmpReg = genRegNumFromMask(cpBlkNode->gtRsvdRegs & RBM_ALLINT); + + if ((size & 8) != 0) + { +#ifdef _TARGET_X86_ + // TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs. + for (unsigned savedOffs = offset; offset < savedOffs + 8; offset += 4) + { + genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset); + genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset); + } +#else // !_TARGET_X86_ + genCodeForLoadOffset(INS_mov, EA_8BYTE, tmpReg, srcAddr, offset); + genCodeForStoreOffset(INS_mov, EA_8BYTE, tmpReg, dstAddr, offset); + offset += 8; +#endif // !_TARGET_X86_ + } + if ((size & 4) != 0) + { + genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset); + genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset); + offset += 4; + } + if ((size & 2) != 0) + { + genCodeForLoadOffset(INS_mov, EA_2BYTE, tmpReg, srcAddr, offset); + genCodeForStoreOffset(INS_mov, EA_2BYTE, tmpReg, dstAddr, offset); + offset += 2; + } + if ((size & 1) != 0) + { + genCodeForLoadOffset(INS_mov, EA_1BYTE, tmpReg, srcAddr, offset); + genCodeForStoreOffset(INS_mov, EA_1BYTE, tmpReg, dstAddr, offset); + } + } +} + +// Generate code for CpBlk by using rep movs +// Preconditions: +// The size argument of the CpBlk is a constant and is between +// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes. 
+void CodeGen::genCodeForCpBlkRepMovs(GenTreeBlk* cpBlkNode)
+{
+    // Make sure we got the arguments of the cpblk operation in the right registers
+    unsigned   size    = cpBlkNode->Size();
+    GenTreePtr dstAddr = cpBlkNode->Addr();
+    GenTreePtr source  = cpBlkNode->Data();
+    GenTreePtr srcAddr = nullptr;
+
+#ifdef DEBUG
+    assert(!dstAddr->isContained());
+    assert(source->isContained());
+
+#ifdef _TARGET_X86_
+    if (size == 0)
+    {
+        noway_assert(cpBlkNode->OperGet() == GT_STORE_DYN_BLK);
+    }
+    else
+#endif
+    {
+#ifdef _TARGET_AMD64_
+        assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT);
+#else
+        assert(size > CPBLK_UNROLL_LIMIT);
+#endif
+    }
+#endif // DEBUG
+
+    genConsumeBlockOp(cpBlkNode, REG_RDI, REG_RSI, REG_RCX);
+    instGen(INS_r_movsb);
+}
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+//---------------------------------------------------------------------------------------------------------------//
+// genStructPutArgUnroll: Generates code for passing a struct arg on stack by value using loop unrolling.
+//
+// Arguments:
+//     putArgNode  - the PutArgStk tree.
+//     baseVarNum  - the base var number, relative to which the by-val struct will be copied on the stack.
+//
+// TODO-Amd64-Unix: Try to share code with copyblk.
+//      Need refactoring of copyblk before it could be used for putarg_stk.
+//      The difference for now is that a putarg_stk contains its children, while cpblk does not.
+//      This creates differences in code. After some significant refactoring it could be reused.
+//
+void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode, unsigned baseVarNum)
+{
+    // We will never call this method for SIMD types, which are stored directly
+    // in genPutStructArgStk().
+    noway_assert(putArgNode->TypeGet() == TYP_STRUCT);
+
+    // Make sure we got the arguments of the cpblk operation in the right registers
+    GenTreePtr dstAddr = putArgNode;
+    GenTreePtr src     = putArgNode->gtOp.gtOp1;
+
+    size_t size = putArgNode->getArgSize();
+    assert(size <= CPBLK_UNROLL_LIMIT);
+
+    emitter* emit         = getEmitter();
+    unsigned putArgOffset = putArgNode->getArgOffset();
+
+    assert(src->isContained());
+
+    assert(src->gtOper == GT_OBJ);
+
+    if (!src->gtOp.gtOp1->isContained())
+    {
+        genConsumeReg(src->gtOp.gtOp1);
+    }
+
+    unsigned offset = 0;
+
+    // If the size of this struct is larger than 16 bytes
+    // let's use SSE2 to be able to do 16 byte at a time
+    // loads and stores.
+    if (size >= XMM_REGSIZE_BYTES)
+    {
+        assert(putArgNode->gtRsvdRegs != RBM_NONE);
+        regNumber xmmReg = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLFLOAT);
+        assert(genIsValidFloatReg(xmmReg));
+        size_t slots = size / XMM_REGSIZE_BYTES;
+
+        assert(putArgNode->gtGetOp1()->isContained());
+        assert(putArgNode->gtGetOp1()->gtOp.gtOper == GT_OBJ);
+
+        // TODO: In the below code the load and store instructions are for 16 bytes, but the
+        //       type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
+        //       this probably needs to be changed.
+        while (slots-- > 0)
+        {
+            // Load 16 bytes from [srcAddr + offset], where srcAddr is the address held by
+            // the child of the Obj node.
+            genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmReg, src->gtGetOp1(), offset);
+
+            // Store
+            emit->emitIns_S_R(INS_movdqu, EA_8BYTE, xmmReg, baseVarNum, putArgOffset + offset);
+
+            offset += XMM_REGSIZE_BYTES;
+        }
+    }
+
+    // Fill the remainder (15 bytes or less) if there's one.
+    if ((size & 0xf) != 0)
+    {
+        // Grab the integer temp register to emit the remaining loads and stores.
+ regNumber tmpReg = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLINT); + assert(genIsValidIntReg(tmpReg)); + + if ((size & 8) != 0) + { + genCodeForLoadOffset(INS_mov, EA_8BYTE, tmpReg, src->gtOp.gtOp1, offset); + + emit->emitIns_S_R(INS_mov, EA_8BYTE, tmpReg, baseVarNum, putArgOffset + offset); + + offset += 8; + } + + if ((size & 4) != 0) + { + genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, src->gtOp.gtOp1, offset); + + emit->emitIns_S_R(INS_mov, EA_4BYTE, tmpReg, baseVarNum, putArgOffset + offset); + + offset += 4; + } + + if ((size & 2) != 0) + { + genCodeForLoadOffset(INS_mov, EA_2BYTE, tmpReg, src->gtOp.gtOp1, offset); + + emit->emitIns_S_R(INS_mov, EA_2BYTE, tmpReg, baseVarNum, putArgOffset + offset); + + offset += 2; + } + + if ((size & 1) != 0) + { + genCodeForLoadOffset(INS_mov, EA_1BYTE, tmpReg, src->gtOp.gtOp1, offset); + emit->emitIns_S_R(INS_mov, EA_1BYTE, tmpReg, baseVarNum, putArgOffset + offset); + } + } +} + +//------------------------------------------------------------------------ +// genStructPutArgRepMovs: Generates code for passing a struct arg by value on stack using Rep Movs. +// +// Arguments: +// putArgNode - the PutArgStk tree. +// baseVarNum - the base var number, relative to which the by-val struct bits will go. +// +// Preconditions: +// The size argument of the PutArgStk (for structs) is a constant and is between +// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes. +// +void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode, unsigned baseVarNum) +{ + assert(putArgNode->TypeGet() == TYP_STRUCT); + assert(putArgNode->getArgSize() > CPBLK_UNROLL_LIMIT); + assert(baseVarNum != BAD_VAR_NUM); + + // Make sure we got the arguments of the cpblk operation in the right registers + GenTreePtr dstAddr = putArgNode; + GenTreePtr srcAddr = putArgNode->gtGetOp1(); + + // Validate state. + assert(putArgNode->gtRsvdRegs == (RBM_RDI | RBM_RCX | RBM_RSI)); + assert(srcAddr->isContained()); + + genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX, baseVarNum); + instGen(INS_r_movsb); +} + +//------------------------------------------------------------------------ +// If any Vector3 args are on stack and they are not pass-by-ref, the upper 32bits +// must be cleared to zeroes. The native compiler doesn't clear the upper bits +// and there is no way to know if the caller is native or not. So, the upper +// 32 bits of Vector argument on stack are always cleared to zero. +#ifdef FEATURE_SIMD +void CodeGen::genClearStackVec3ArgUpperBits() +{ +#ifdef DEBUG + if (verbose) + printf("*************** In genClearStackVec3ArgUpperBits()\n"); +#endif + + assert(compiler->compGeneratingProlog); + + unsigned varNum = 0; + + for (unsigned varNum = 0; varNum < compiler->info.compArgsCount; varNum++) + { + LclVarDsc* varDsc = &(compiler->lvaTable[varNum]); + assert(varDsc->lvIsParam); + + // Does var has simd12 type? + if (varDsc->lvType != TYP_SIMD12) + { + continue; + } + + if (!varDsc->lvIsRegArg) + { + // Clear the upper 32 bits by mov dword ptr [V_ARG_BASE+0xC], 0 + getEmitter()->emitIns_S_I(ins_Store(TYP_INT), EA_4BYTE, varNum, genTypeSize(TYP_FLOAT) * 3, 0); + } + else + { + // Assume that for x64 linux, an argument is fully in registers + // or fully on stack. + regNumber argReg = varDsc->GetOtherArgReg(); + + // Clear the upper 32 bits by two shift instructions. 
+ // argReg = argReg << 96 + getEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), argReg, 12); + // argReg = argReg >> 96 + getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), argReg, 12); + } + } +} +#endif // FEATURE_SIMD +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + +// Generate code for CpObj nodes wich copy structs that have interleaved +// GC pointers. +// This will generate a sequence of movsq instructions for the cases of non-gc members +// and calls to the BY_REF_ASSIGN helper otherwise. +void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode) +{ + // Make sure we got the arguments of the cpobj operation in the right registers + GenTreePtr dstAddr = cpObjNode->Addr(); + GenTreePtr source = cpObjNode->Data(); + GenTreePtr srcAddr = nullptr; + bool sourceIsLocal = false; + + assert(source->isContained()); + if (source->gtOper == GT_IND) + { + srcAddr = source->gtGetOp1(); + assert(!srcAddr->isContained()); + } + else + { + noway_assert(source->IsLocal()); + sourceIsLocal = true; + // TODO: Consider making the addrForm() method in Rationalize public, e.g. in GenTree. + // OR: transform source to GT_IND(GT_LCL_VAR_ADDR) + if (source->OperGet() == GT_LCL_VAR) + { + source->SetOper(GT_LCL_VAR_ADDR); + } + else + { + assert(source->OperGet() == GT_LCL_FLD); + source->SetOper(GT_LCL_FLD_ADDR); + } + srcAddr = source; + } + + bool dstOnStack = dstAddr->OperIsLocalAddr(); + +#ifdef DEBUG + bool isRepMovsqUsed = false; + + assert(!dstAddr->isContained()); + + // If the GenTree node has data about GC pointers, this means we're dealing + // with CpObj, so this requires special logic. + assert(cpObjNode->gtGcPtrCount > 0); + + // MovSq instruction is used for copying non-gcref fields and it needs + // src = RSI and dst = RDI. + // Either these registers must not contain lclVars, or they must be dying or marked for spill. + // This is because these registers are incremented as we go through the struct. + GenTree* actualSrcAddr = srcAddr->gtSkipReloadOrCopy(); + GenTree* actualDstAddr = dstAddr->gtSkipReloadOrCopy(); + unsigned srcLclVarNum = BAD_VAR_NUM; + unsigned dstLclVarNum = BAD_VAR_NUM; + bool isSrcAddrLiveOut = false; + bool isDstAddrLiveOut = false; + if (genIsRegCandidateLocal(actualSrcAddr)) + { + srcLclVarNum = actualSrcAddr->AsLclVarCommon()->gtLclNum; + isSrcAddrLiveOut = ((actualSrcAddr->gtFlags & (GTF_VAR_DEATH | GTF_SPILL)) == 0); + } + if (genIsRegCandidateLocal(actualDstAddr)) + { + dstLclVarNum = actualDstAddr->AsLclVarCommon()->gtLclNum; + isDstAddrLiveOut = ((actualDstAddr->gtFlags & (GTF_VAR_DEATH | GTF_SPILL)) == 0); + } + assert((actualSrcAddr->gtRegNum != REG_RSI) || !isSrcAddrLiveOut || + ((srcLclVarNum == dstLclVarNum) && !isDstAddrLiveOut)); + assert((actualDstAddr->gtRegNum != REG_RDI) || !isDstAddrLiveOut || + ((srcLclVarNum == dstLclVarNum) && !isSrcAddrLiveOut)); +#endif // DEBUG + + // Consume these registers. + // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing"). + if (sourceIsLocal) + { + inst_RV_TT(INS_lea, REG_RSI, source, 0, EA_BYREF); + genConsumeBlockOp(cpObjNode, REG_RDI, REG_NA, REG_NA); + } + else + { + genConsumeBlockOp(cpObjNode, REG_RDI, REG_RSI, REG_NA); + } + gcInfo.gcMarkRegPtrVal(REG_RSI, srcAddr->TypeGet()); + gcInfo.gcMarkRegPtrVal(REG_RDI, dstAddr->TypeGet()); + + unsigned slots = cpObjNode->gtSlots; + + // If we can prove it's on the stack we don't need to use the write barrier. 
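+    // For a stack destination the copy below is a plain sequence of movsq instructions,
+    // or a single "mov ecx, slots / rep movsq" once the struct has CPOBJ_NONGC_SLOTS_LIMIT
+    // or more slots. For a heap destination, runs of non-GC slots are copied the same way,
+    // while each GC-pointer slot is copied by calling CORINFO_HELP_ASSIGN_BYREF, which
+    // advances RSI and RDI past the slot so the movsq sequence can resume where it left off.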
+    if (dstOnStack)
+    {
+        if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
+        {
+#ifdef DEBUG
+            // If the destination of the CpObj is on the stack
+            // make sure we allocated RCX to emit rep movsq.
+            regNumber tmpReg = genRegNumFromMask(cpObjNode->gtRsvdRegs & RBM_ALLINT);
+            assert(tmpReg == REG_RCX);
+            isRepMovsqUsed = true;
+#endif // DEBUG
+
+            getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, slots);
+            instGen(INS_r_movsq);
+        }
+        else
+        {
+            // For small structs, it's better to emit a sequence of movsq than to
+            // emit a rep movsq instruction.
+            while (slots > 0)
+            {
+                instGen(INS_movsq);
+                slots--;
+            }
+        }
+    }
+    else
+    {
+        BYTE*    gcPtrs     = cpObjNode->gtGcPtrs;
+        unsigned gcPtrCount = cpObjNode->gtGcPtrCount;
+
+        unsigned i = 0;
+        while (i < slots)
+        {
+            switch (gcPtrs[i])
+            {
+                case TYPE_GC_NONE:
+                    // Let's see if we can use rep movsq instead of a sequence of movsq instructions
+                    // to save cycles and code size.
+                    {
+                        unsigned nonGcSlotCount = 0;
+
+                        do
+                        {
+                            nonGcSlotCount++;
+                            i++;
+                        } while (i < slots && gcPtrs[i] == TYPE_GC_NONE);
+
+                        // If we have a very small contiguous non-gc region, it's better just to
+                        // emit a sequence of movsq instructions
+                        if (nonGcSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
+                        {
+                            while (nonGcSlotCount > 0)
+                            {
+                                instGen(INS_movsq);
+                                nonGcSlotCount--;
+                            }
+                        }
+                        else
+                        {
+#ifdef DEBUG
+                            // Otherwise, we can save code-size and improve CQ by emitting
+                            // rep movsq
+                            regNumber tmpReg = genRegNumFromMask(cpObjNode->gtRsvdRegs & RBM_ALLINT);
+                            assert(tmpReg == REG_RCX);
+                            isRepMovsqUsed = true;
+#endif // DEBUG
+                            getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, nonGcSlotCount);
+                            instGen(INS_r_movsq);
+                        }
+                    }
+                    break;
+                default:
+                    // We have a GC pointer; copy it via the byref-assignment write barrier helper.
+                    genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE);
+                    gcPtrCount--;
+                    i++;
+            }
+        }
+
+        assert(gcPtrCount == 0);
+    }
+
+    // Clear the gcInfo for RSI and RDI.
+    // While we normally update GC info prior to the last instruction that uses them,
+    // these actually live into the helper call.
+    gcInfo.gcMarkRegSetNpt(RBM_RSI);
+    gcInfo.gcMarkRegSetNpt(RBM_RDI);
+}
+
+// Generate code for a CpBlk node by means of the VM memcpy helper call
+// Preconditions:
+// a) The size argument of the CpBlk is not an integer constant
+// b) The size argument is a constant but is larger than CPBLK_MOVS_LIMIT bytes.
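+// Notes:
+// The dst address, src address and size are consumed into REG_ARG_0, REG_ARG_1 and
+// REG_ARG_2 and the copy is performed by a call to the CORINFO_HELP_MEMCPY helper;
+// this path is only implemented for AMD64.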
+void CodeGen::genCodeForCpBlk(GenTreeBlk* cpBlkNode) +{ +#ifdef _TARGET_AMD64_ + // Make sure we got the arguments of the cpblk operation in the right registers + unsigned blockSize = cpBlkNode->Size(); + GenTreePtr dstAddr = cpBlkNode->Addr(); + GenTreePtr source = cpBlkNode->Data(); + GenTreePtr srcAddr = nullptr; + + // Size goes in arg2 + if (blockSize != 0) + { + assert(blockSize >= CPBLK_MOVS_LIMIT); + assert((cpBlkNode->gtRsvdRegs & RBM_ARG_2) != 0); + } + else + { + noway_assert(cpBlkNode->gtOper == GT_STORE_DYN_BLK); + } + + // Source address goes in arg1 + if (source->gtOper == GT_IND) + { + srcAddr = source->gtGetOp1(); + assert(!srcAddr->isContained()); + } + else + { + noway_assert(source->IsLocal()); + assert((cpBlkNode->gtRsvdRegs & RBM_ARG_1) != 0); + inst_RV_TT(INS_lea, REG_ARG_1, source, 0, EA_BYREF); + } + + genConsumeBlockOp(cpBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2); + + genEmitHelperCall(CORINFO_HELP_MEMCPY, 0, EA_UNKNOWN); +#else // !_TARGET_AMD64_ + noway_assert(false && "Helper call for CpBlk is not needed."); +#endif // !_TARGET_AMD64_ +} + +// generate code do a switch statement based on a table of ip-relative offsets +void CodeGen::genTableBasedSwitch(GenTree* treeNode) +{ + genConsumeOperands(treeNode->AsOp()); + regNumber idxReg = treeNode->gtOp.gtOp1->gtRegNum; + regNumber baseReg = treeNode->gtOp.gtOp2->gtRegNum; + + regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs); + + // load the ip-relative offset (which is relative to start of fgFirstBB) + getEmitter()->emitIns_R_ARX(INS_mov, EA_4BYTE, baseReg, baseReg, idxReg, 4, 0); + + // add it to the absolute address of fgFirstBB + compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET; + getEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, tmpReg); + getEmitter()->emitIns_R_R(INS_add, EA_PTRSIZE, baseReg, tmpReg); + // jmp baseReg + getEmitter()->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), baseReg); +} + +// emits the table and an instruction to get the address of the first element +void CodeGen::genJumpTable(GenTree* treeNode) +{ + noway_assert(compiler->compCurBB->bbJumpKind == BBJ_SWITCH); + assert(treeNode->OperGet() == GT_JMPTABLE); + + unsigned jumpCount = compiler->compCurBB->bbJumpSwt->bbsCount; + BasicBlock** jumpTable = compiler->compCurBB->bbJumpSwt->bbsDstTab; + unsigned jmpTabOffs; + unsigned jmpTabBase; + + jmpTabBase = getEmitter()->emitBBTableDataGenBeg(jumpCount, true); + + jmpTabOffs = 0; + + JITDUMP("\n J_M%03u_DS%02u LABEL DWORD\n", Compiler::s_compMethodsCount, jmpTabBase); + + for (unsigned i = 0; i < jumpCount; i++) + { + BasicBlock* target = *jumpTable++; + noway_assert(target->bbFlags & BBF_JMP_TARGET); + + JITDUMP(" DD L_M%03u_BB%02u\n", Compiler::s_compMethodsCount, target->bbNum); + + getEmitter()->emitDataGenData(i, target); + }; + + getEmitter()->emitDataGenEnd(); + + // Access to inline data is 'abstracted' by a special type of static member + // (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference + // to constant data, not a real static field. 
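+    // The "lea" emitted below therefore loads the address of the first DWORD entry of
+    // the table laid out above into the target register.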
+ getEmitter()->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), treeNode->gtRegNum, + compiler->eeFindJitDataOffs(jmpTabBase), 0); + genProduceReg(treeNode); +} + +// generate code for the locked operations: +// GT_LOCKADD, GT_XCHG, GT_XADD +void CodeGen::genLockedInstructions(GenTree* treeNode) +{ + GenTree* data = treeNode->gtOp.gtOp2; + GenTree* addr = treeNode->gtOp.gtOp1; + regNumber targetReg = treeNode->gtRegNum; + regNumber dataReg = data->gtRegNum; + regNumber addrReg = addr->gtRegNum; + instruction ins; + + // all of these nodes implicitly do an indirection on op1 + // so create a temporary node to feed into the pattern matching + GenTreeIndir i = indirForm(data->TypeGet(), addr); + genConsumeReg(addr); + + // The register allocator should have extended the lifetime of the address + // so that it is not used as the target. + noway_assert(addrReg != targetReg); + + // If data is a lclVar that's not a last use, we'd better have allocated a register + // for the result (except in the case of GT_LOCKADD which does not produce a register result). + assert(targetReg != REG_NA || treeNode->OperGet() == GT_LOCKADD || !genIsRegCandidateLocal(data) || + (data->gtFlags & GTF_VAR_DEATH) != 0); + + genConsumeIfReg(data); + if (targetReg != REG_NA && dataReg != REG_NA && dataReg != targetReg) + { + inst_RV_RV(ins_Copy(data->TypeGet()), targetReg, dataReg); + data->gtRegNum = targetReg; + + // TODO-XArch-Cleanup: Consider whether it is worth it, for debugging purposes, to restore the + // original gtRegNum on data, after calling emitInsBinary below. + } + switch (treeNode->OperGet()) + { + case GT_LOCKADD: + instGen(INS_lock); + ins = INS_add; + break; + case GT_XCHG: + // lock is implied by xchg + ins = INS_xchg; + break; + case GT_XADD: + instGen(INS_lock); + ins = INS_xadd; + break; + default: + unreached(); + } + getEmitter()->emitInsBinary(ins, emitTypeSize(data), &i, data); + + if (treeNode->gtRegNum != REG_NA) + { + genProduceReg(treeNode); + } +} + +// generate code for BoundsCheck nodes +void CodeGen::genRangeCheck(GenTreePtr oper) +{ +#ifdef FEATURE_SIMD + noway_assert(oper->OperGet() == GT_ARR_BOUNDS_CHECK || oper->OperGet() == GT_SIMD_CHK); +#else // !FEATURE_SIMD + noway_assert(oper->OperGet() == GT_ARR_BOUNDS_CHECK); +#endif // !FEATURE_SIMD + + GenTreeBoundsChk* bndsChk = oper->AsBoundsChk(); + + GenTreePtr arrLen = bndsChk->gtArrLen; + GenTreePtr arrIndex = bndsChk->gtIndex; + GenTreePtr arrRef = nullptr; + int lenOffset = 0; + + GenTree * src1, *src2; + emitJumpKind jmpKind; + + genConsumeRegs(arrLen); + genConsumeRegs(arrIndex); + + if (arrIndex->isContainedIntOrIImmed()) + { + // arrIndex is a contained constant. In this case + // we will generate one of the following + // cmp [mem], immed (if arrLen is a memory op) + // cmp reg, immed (if arrLen is in a reg) + // + // That is arrLen cannot be a contained immed. + assert(!arrLen->isContainedIntOrIImmed()); + + src1 = arrLen; + src2 = arrIndex; + jmpKind = EJ_jbe; + } + else + { + // arrIndex could either be a contained memory op or a reg + // In this case we will generate one of the following + // cmp [mem], immed (if arrLen is a constant) + // cmp [mem], reg (if arrLen is in a reg) + // cmp reg, immed (if arrIndex is in a reg) + // cmp reg1, reg2 (if arraIndex is in reg1) + // cmp reg, [mem] (if arrLen is a memory op) + // + // That is only one of arrIndex or arrLen can be a memory op. 
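+        // Whichever form applies, the code emitted below is a single "cmp" followed by a
+        // conditional jump (jbe for the contained-constant-index case above, jae here) to
+        // the range-check failure throw block.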
+ assert(!arrIndex->isContainedMemoryOp() || !arrLen->isContainedMemoryOp()); + + src1 = arrIndex; + src2 = arrLen; + jmpKind = EJ_jae; + } + + var_types bndsChkType = src2->TypeGet(); +#if DEBUG + // Bounds checks can only be 32 or 64 bit sized comparisons. + assert(bndsChkType == TYP_INT || bndsChkType == TYP_LONG); + + // The type of the bounds check should always wide enough to compare against the index. + assert(emitTypeSize(bndsChkType) >= emitTypeSize(src1->TypeGet())); +#endif // DEBUG + + getEmitter()->emitInsBinary(INS_cmp, emitTypeSize(bndsChkType), src1, src2); + genJumpToThrowHlpBlk(jmpKind, bndsChk->gtThrowKind, bndsChk->gtIndRngFailBB); +} + +//------------------------------------------------------------------------ +// genOffsetOfMDArrayLowerBound: Returns the offset from the Array object to the +// lower bound for the given dimension. +// +// Arguments: +// elemType - the element type of the array +// rank - the rank of the array +// dimension - the dimension for which the lower bound offset will be returned. +// +// Return Value: +// The offset. + +unsigned CodeGen::genOffsetOfMDArrayLowerBound(var_types elemType, unsigned rank, unsigned dimension) +{ + // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets. + return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * (dimension + rank); +} + +//------------------------------------------------------------------------ +// genOffsetOfMDArrayLength: Returns the offset from the Array object to the +// size for the given dimension. +// +// Arguments: +// elemType - the element type of the array +// rank - the rank of the array +// dimension - the dimension for which the lower bound offset will be returned. +// +// Return Value: +// The offset. + +unsigned CodeGen::genOffsetOfMDArrayDimensionSize(var_types elemType, unsigned rank, unsigned dimension) +{ + // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets. + return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * dimension; +} + +//------------------------------------------------------------------------ +// genCodeForArrIndex: Generates code to bounds check the index for one dimension of an array reference, +// producing the effective index by subtracting the lower bound. +// +// Arguments: +// arrIndex - the node for which we're generating code +// +// Return Value: +// None. +// + +void CodeGen::genCodeForArrIndex(GenTreeArrIndex* arrIndex) +{ + GenTreePtr arrObj = arrIndex->ArrObj(); + GenTreePtr indexNode = arrIndex->IndexExpr(); + + regNumber arrReg = genConsumeReg(arrObj); + regNumber indexReg = genConsumeReg(indexNode); + regNumber tgtReg = arrIndex->gtRegNum; + + unsigned dim = arrIndex->gtCurrDim; + unsigned rank = arrIndex->gtArrRank; + var_types elemType = arrIndex->gtArrElemType; + + noway_assert(tgtReg != REG_NA); + + // Subtract the lower bound for this dimension. + // TODO-XArch-CQ: make this contained if it's an immediate that fits. 
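+    // The sequence emitted below is, in outline:
+    //     mov  tgtReg, indexReg      ; only if the registers differ
+    //     sub  tgtReg, [arrReg + lower-bound offset for this dimension]
+    //     cmp  tgtReg, [arrReg + dimension-size offset for this dimension]
+    //     jae  <range-check-fail throw block>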
+ if (tgtReg != indexReg) + { + inst_RV_RV(INS_mov, tgtReg, indexReg, indexNode->TypeGet()); + } + getEmitter()->emitIns_R_AR(INS_sub, emitActualTypeSize(TYP_INT), tgtReg, arrReg, + genOffsetOfMDArrayLowerBound(elemType, rank, dim)); + getEmitter()->emitIns_R_AR(INS_cmp, emitActualTypeSize(TYP_INT), tgtReg, arrReg, + genOffsetOfMDArrayDimensionSize(elemType, rank, dim)); + genJumpToThrowHlpBlk(EJ_jae, SCK_RNGCHK_FAIL); + + genProduceReg(arrIndex); +} + +//------------------------------------------------------------------------ +// genCodeForArrOffset: Generates code to compute the flattened array offset for +// one dimension of an array reference: +// result = (prevDimOffset * dimSize) + effectiveIndex +// where dimSize is obtained from the arrObj operand +// +// Arguments: +// arrOffset - the node for which we're generating code +// +// Return Value: +// None. +// +// Notes: +// dimSize and effectiveIndex are always non-negative, the former by design, +// and the latter because it has been normalized to be zero-based. + +void CodeGen::genCodeForArrOffset(GenTreeArrOffs* arrOffset) +{ + GenTreePtr offsetNode = arrOffset->gtOffset; + GenTreePtr indexNode = arrOffset->gtIndex; + GenTreePtr arrObj = arrOffset->gtArrObj; + + regNumber tgtReg = arrOffset->gtRegNum; + + noway_assert(tgtReg != REG_NA); + + unsigned dim = arrOffset->gtCurrDim; + unsigned rank = arrOffset->gtArrRank; + var_types elemType = arrOffset->gtArrElemType; + + // We will use a temp register for the offset*scale+effectiveIndex computation. + regMaskTP tmpRegMask = arrOffset->gtRsvdRegs; + regNumber tmpReg = genRegNumFromMask(tmpRegMask); + + // First, consume the operands in the correct order. + regNumber offsetReg = REG_NA; + if (!offsetNode->IsIntegralConst(0)) + { + offsetReg = genConsumeReg(offsetNode); + } + else + { + assert(offsetNode->isContained()); + } + regNumber indexReg = genConsumeReg(indexNode); + // Although arrReg may not be used in the constant-index case, if we have generated + // the value into a register, we must consume it, otherwise we will fail to end the + // live range of the gc ptr. + // TODO-CQ: Currently arrObj will always have a register allocated to it. + // We could avoid allocating a register for it, which would be of value if the arrObj + // is an on-stack lclVar. + regNumber arrReg = REG_NA; + if (arrObj->gtHasReg()) + { + arrReg = genConsumeReg(arrObj); + } + + if (!offsetNode->IsIntegralConst(0)) + { + // Evaluate tgtReg = offsetReg*dim_size + indexReg. + // tmpReg is used to load dim_size and the result of the multiplication. + // Note that dim_size will never be negative. 
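+        // That is, roughly:
+        //     mov  tmpReg, [arrReg + dimension-size offset]
+        //     imul tmpReg, offsetReg
+        // followed by adding indexReg and tmpReg together into tgtReg (with a mov into
+        // tgtReg first when neither of them is already the target register).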
+ + getEmitter()->emitIns_R_AR(INS_mov, emitActualTypeSize(TYP_INT), tmpReg, arrReg, + genOffsetOfMDArrayDimensionSize(elemType, rank, dim)); + inst_RV_RV(INS_imul, tmpReg, offsetReg); + + if (tmpReg == tgtReg) + { + inst_RV_RV(INS_add, tmpReg, indexReg); + } + else + { + if (indexReg != tgtReg) + { + inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_I_IMPL); + } + inst_RV_RV(INS_add, tgtReg, tmpReg); + } + } + else + { + if (indexReg != tgtReg) + { + inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_INT); + } + } + genProduceReg(arrOffset); +} + +// make a temporary indir we can feed to pattern matching routines +// in cases where we don't want to instantiate all the indirs that happen +// +GenTreeIndir CodeGen::indirForm(var_types type, GenTree* base) +{ + GenTreeIndir i(GT_IND, type, base, nullptr); + i.gtRegNum = REG_NA; + // has to be nonnull (because contained nodes can't be the last in block) + // but don't want it to be a valid pointer + i.gtNext = (GenTree*)(-1); + return i; +} + +// make a temporary int we can feed to pattern matching routines +// in cases where we don't want to instantiate +// +GenTreeIntCon CodeGen::intForm(var_types type, ssize_t value) +{ + GenTreeIntCon i(type, value); + i.gtRegNum = REG_NA; + // has to be nonnull (because contained nodes can't be the last in block) + // but don't want it to be a valid pointer + i.gtNext = (GenTree*)(-1); + return i; +} + +instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type) +{ + instruction ins; + + // Operations on SIMD vectors shouldn't come this path + assert(!varTypeIsSIMD(type)); + if (varTypeIsFloating(type)) + { + return ins_MathOp(oper, type); + } + + switch (oper) + { + case GT_ADD: + ins = INS_add; + break; + case GT_AND: + ins = INS_and; + break; + case GT_LSH: + ins = INS_shl; + break; + case GT_MUL: + ins = INS_imul; + break; + case GT_NEG: + ins = INS_neg; + break; + case GT_NOT: + ins = INS_not; + break; + case GT_OR: + ins = INS_or; + break; + case GT_ROL: + ins = INS_rol; + break; + case GT_ROR: + ins = INS_ror; + break; + case GT_RSH: + ins = INS_sar; + break; + case GT_RSZ: + ins = INS_shr; + break; + case GT_SUB: + ins = INS_sub; + break; + case GT_XOR: + ins = INS_xor; + break; +#if !defined(_TARGET_64BIT_) + case GT_ADD_LO: + ins = INS_add; + break; + case GT_ADD_HI: + ins = INS_adc; + break; + case GT_SUB_LO: + ins = INS_sub; + break; + case GT_SUB_HI: + ins = INS_sbb; + break; +#endif // !defined(_TARGET_64BIT_) + default: + unreached(); + break; + } + return ins; +} + +//------------------------------------------------------------------------ +// genCodeForShift: Generates the code sequence for a GenTree node that +// represents a bit shift or rotate operation (<<, >>, >>>, rol, ror). +// +// Arguments: +// tree - the bit shift node (that specifies the type of bit shift to perform). +// +// Assumptions: +// a) All GenTrees are register allocated. +// b) The shift-by-amount in tree->gtOp.gtOp2 is either a contained constant or +// it's a register-allocated expression. If it is in a register that is +// not RCX, it will be moved to RCX (so RCX better not be in use!). +// +void CodeGen::genCodeForShift(GenTreePtr tree) +{ + // Only the non-RMW case here. 
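+    // Two forms are emitted below: "shl/sar/... reg, imm" when the shift amount is a
+    // contained constant, and "shl/sar/... reg, cl" otherwise (after moving the amount
+    // into RCX if it is not already there). In both cases the operand is first copied
+    // into the destination register if it is not already there.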
+ assert(tree->OperIsShiftOrRotate()); + assert(!tree->gtOp.gtOp1->isContained()); + assert(tree->gtRegNum != REG_NA); + + genConsumeOperands(tree->AsOp()); + + var_types targetType = tree->TypeGet(); + instruction ins = genGetInsForOper(tree->OperGet(), targetType); + + GenTreePtr operand = tree->gtGetOp1(); + regNumber operandReg = operand->gtRegNum; + + GenTreePtr shiftBy = tree->gtGetOp2(); + if (shiftBy->isContainedIntOrIImmed()) + { + // First, move the operand to the destination register and + // later on perform the shift in-place. + // (LSRA will try to avoid this situation through preferencing.) + if (tree->gtRegNum != operandReg) + { + inst_RV_RV(INS_mov, tree->gtRegNum, operandReg, targetType); + } + + int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue(); + inst_RV_SH(ins, emitTypeSize(tree), tree->gtRegNum, shiftByValue); + } + else + { + // We must have the number of bits to shift stored in ECX, since we constrained this node to + // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single + // register destination requirement. + regNumber shiftReg = shiftBy->gtRegNum; + if (shiftReg != REG_RCX) + { + // Issue the mov to RCX: + inst_RV_RV(INS_mov, REG_RCX, shiftReg, shiftBy->TypeGet()); + } + + // The operand to be shifted must not be in ECX + noway_assert(operandReg != REG_RCX); + + if (tree->gtRegNum != operandReg) + { + inst_RV_RV(INS_mov, tree->gtRegNum, operandReg, targetType); + } + inst_RV_CL(ins, tree->gtRegNum, targetType); + } + + genProduceReg(tree); +} + +//------------------------------------------------------------------------ +// genCodeForShiftRMW: Generates the code sequence for a GT_STOREIND GenTree node that +// represents a RMW bit shift or rotate operation (<<, >>, >>>, rol, ror), for example: +// GT_STOREIND( AddressTree, GT_SHL( Ind ( AddressTree ), Operand ) ) +// +// Arguments: +// storeIndNode: the GT_STOREIND node. +// +void CodeGen::genCodeForShiftRMW(GenTreeStoreInd* storeInd) +{ + GenTree* data = storeInd->Data(); + GenTree* addr = storeInd->Addr(); + + assert(data->OperIsShiftOrRotate()); + + // This function only handles the RMW case. + assert(data->gtOp.gtOp1->isContained()); + assert(data->gtOp.gtOp1->isIndir()); + assert(Lowering::IndirsAreEquivalent(data->gtOp.gtOp1, storeInd)); + assert(data->gtRegNum == REG_NA); + + var_types targetType = data->TypeGet(); + genTreeOps oper = data->OperGet(); + instruction ins = genGetInsForOper(oper, targetType); + emitAttr attr = EA_ATTR(genTypeSize(targetType)); + + GenTree* shiftBy = data->gtOp.gtOp2; + if (shiftBy->isContainedIntOrIImmed()) + { + int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue(); + ins = genMapShiftInsToShiftByConstantIns(ins, shiftByValue); + if (shiftByValue == 1) + { + // There is no source in this case, as the shift by count is embedded in the instruction opcode itself. + getEmitter()->emitInsRMW(ins, attr, storeInd); + } + else + { + getEmitter()->emitInsRMW(ins, attr, storeInd, shiftBy); + } + } + else + { + // We must have the number of bits to shift stored in ECX, since we constrained this node to + // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single + // register destination requirement. + regNumber shiftReg = shiftBy->gtRegNum; + if (shiftReg != REG_RCX) + { + // Issue the mov to RCX: + inst_RV_RV(INS_mov, REG_RCX, shiftReg, shiftBy->TypeGet()); + } + + // The shiftBy operand is implicit, so call the unary version of emitInsRMW. 
+ getEmitter()->emitInsRMW(ins, attr, storeInd); + } +} + +void CodeGen::genUnspillRegIfNeeded(GenTree* tree) +{ + regNumber dstReg = tree->gtRegNum; + GenTree* unspillTree = tree; + + if (tree->gtOper == GT_RELOAD) + { + unspillTree = tree->gtOp.gtOp1; + } + + if ((unspillTree->gtFlags & GTF_SPILLED) != 0) + { + if (genIsRegCandidateLocal(unspillTree)) + { + // Reset spilled flag, since we are going to load a local variable from its home location. + unspillTree->gtFlags &= ~GTF_SPILLED; + + GenTreeLclVarCommon* lcl = unspillTree->AsLclVarCommon(); + LclVarDsc* varDsc = &compiler->lvaTable[lcl->gtLclNum]; + + // Load local variable from its home location. + // In most cases the tree type will indicate the correct type to use for the load. + // However, if it is NOT a normalizeOnLoad lclVar (i.e. NOT a small int that always gets + // widened when loaded into a register), and its size is not the same as genActualType of + // the type of the lclVar, then we need to change the type of the tree node when loading. + // This situation happens due to "optimizations" that avoid a cast and + // simply retype the node when using long type lclVar as an int. + // While loading the int in that case would work for this use of the lclVar, if it is + // later used as a long, we will have incorrectly truncated the long. + // In the normalizeOnLoad case ins_Load will return an appropriate sign- or zero- + // extending load. + + var_types treeType = unspillTree->TypeGet(); + if (treeType != genActualType(varDsc->lvType) && !varTypeIsGC(treeType) && !varDsc->lvNormalizeOnLoad()) + { + assert(!varTypeIsGC(varDsc)); + var_types spillType = genActualType(varDsc->lvType); + unspillTree->gtType = spillType; + inst_RV_TT(ins_Load(spillType, compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)), dstReg, unspillTree); + unspillTree->gtType = treeType; + } + else + { + inst_RV_TT(ins_Load(treeType, compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)), dstReg, unspillTree); + } + + unspillTree->SetInReg(); + + // TODO-Review: We would like to call: + // genUpdateRegLife(varDsc, /*isBorn*/ true, /*isDying*/ false DEBUGARG(tree)); + // instead of the following code, but this ends up hitting this assert: + // assert((regSet.rsMaskVars & regMask) == 0); + // due to issues with LSRA resolution moves. + // So, just force it for now. This probably indicates a condition that creates a GC hole! + // + // Extra note: I think we really want to call something like gcInfo.gcUpdateForRegVarMove, + // because the variable is not really going live or dead, but that method is somewhat poorly + // factored because it, in turn, updates rsMaskVars which is part of RegSet not GCInfo. + // TODO-Cleanup: This code exists in other CodeGen*.cpp files, and should be moved to CodeGenCommon.cpp. + + // Don't update the variable's location if we are just re-spilling it again. 
+ + if ((unspillTree->gtFlags & GTF_SPILL) == 0) + { + genUpdateVarReg(varDsc, tree); +#ifdef DEBUG + if (VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex)) + { + JITDUMP("\t\t\t\t\t\t\tRemoving V%02u from gcVarPtrSetCur\n", lcl->gtLclNum); + } +#endif // DEBUG + VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); + +#ifdef DEBUG + if (compiler->verbose) + { + printf("\t\t\t\t\t\t\tV%02u in reg ", lcl->gtLclNum); + varDsc->PrintVarReg(); + printf(" is becoming live "); + compiler->printTreeID(unspillTree); + printf("\n"); + } +#endif // DEBUG + + regSet.AddMaskVars(genGetRegMask(varDsc)); + } + + gcInfo.gcMarkRegPtrVal(dstReg, unspillTree->TypeGet()); + } + else if (unspillTree->IsMultiRegCall()) + { + GenTreeCall* call = unspillTree->AsCall(); + ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); + unsigned regCount = retTypeDesc->GetReturnRegCount(); + GenTreeCopyOrReload* reloadTree = nullptr; + if (tree->OperGet() == GT_RELOAD) + { + reloadTree = tree->AsCopyOrReload(); + } + + // In case of multi-reg call node, GTF_SPILLED flag on it indicates that + // one or more of its result regs are spilled. Call node needs to be + // queried to know which specific result regs to be unspilled. + for (unsigned i = 0; i < regCount; ++i) + { + unsigned flags = call->GetRegSpillFlagByIdx(i); + if ((flags & GTF_SPILLED) != 0) + { + var_types dstType = retTypeDesc->GetReturnRegType(i); + regNumber unspillTreeReg = call->GetRegNumByIdx(i); + + if (reloadTree != nullptr) + { + dstReg = reloadTree->GetRegNumByIdx(i); + if (dstReg == REG_NA) + { + dstReg = unspillTreeReg; + } + } + else + { + dstReg = unspillTreeReg; + } + + TempDsc* t = regSet.rsUnspillInPlace(call, unspillTreeReg, i); + getEmitter()->emitIns_R_S(ins_Load(dstType), emitActualTypeSize(dstType), dstReg, t->tdTempNum(), + 0); + compiler->tmpRlsTemp(t); + gcInfo.gcMarkRegPtrVal(dstReg, dstType); + } + } + + unspillTree->gtFlags &= ~GTF_SPILLED; + unspillTree->SetInReg(); + } + else + { + TempDsc* t = regSet.rsUnspillInPlace(unspillTree, unspillTree->gtRegNum); + getEmitter()->emitIns_R_S(ins_Load(unspillTree->gtType), emitActualTypeSize(unspillTree->TypeGet()), dstReg, + t->tdTempNum(), 0); + compiler->tmpRlsTemp(t); + + unspillTree->gtFlags &= ~GTF_SPILLED; + unspillTree->SetInReg(); + gcInfo.gcMarkRegPtrVal(dstReg, unspillTree->TypeGet()); + } + } +} + +// Do Liveness update for a subnodes that is being consumed by codegen +// including the logic for reload in case is needed and also takes care +// of locating the value on the desired register. 
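+// If needReg is REG_NA this does nothing; otherwise the subnode is consumed as usual
+// and its value is moved into needReg when it is not already there.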
+void CodeGen::genConsumeRegAndCopy(GenTree* tree, regNumber needReg) +{ + if (needReg == REG_NA) + { + return; + } + regNumber treeReg = genConsumeReg(tree); + if (treeReg != needReg) + { + inst_RV_RV(INS_mov, needReg, treeReg, tree->TypeGet()); + } +} + +void CodeGen::genRegCopy(GenTree* treeNode) +{ + assert(treeNode->OperGet() == GT_COPY); + GenTree* op1 = treeNode->gtOp.gtOp1; + + if (op1->IsMultiRegCall()) + { + genConsumeReg(op1); + + GenTreeCopyOrReload* copyTree = treeNode->AsCopyOrReload(); + GenTreeCall* call = op1->AsCall(); + ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); + unsigned regCount = retTypeDesc->GetReturnRegCount(); + + for (unsigned i = 0; i < regCount; ++i) + { + var_types type = retTypeDesc->GetReturnRegType(i); + regNumber fromReg = call->GetRegNumByIdx(i); + regNumber toReg = copyTree->GetRegNumByIdx(i); + + // A Multi-reg GT_COPY node will have valid reg only for those + // positions that corresponding result reg of call node needs + // to be copied. + if (toReg != REG_NA) + { + assert(toReg != fromReg); + inst_RV_RV(ins_Copy(type), toReg, fromReg, type); + } + } + } + else + { + var_types targetType = treeNode->TypeGet(); + regNumber targetReg = treeNode->gtRegNum; + assert(targetReg != REG_NA); + + // Check whether this node and the node from which we're copying the value have + // different register types. This can happen if (currently iff) we have a SIMD + // vector type that fits in an integer register, in which case it is passed as + // an argument, or returned from a call, in an integer register and must be + // copied if it's in an xmm register. + + bool srcFltReg = (varTypeIsFloating(op1) || varTypeIsSIMD(op1)); + bool tgtFltReg = (varTypeIsFloating(treeNode) || varTypeIsSIMD(treeNode)); + if (srcFltReg != tgtFltReg) + { + instruction ins; + regNumber fpReg; + regNumber intReg; + if (tgtFltReg) + { + ins = ins_CopyIntToFloat(op1->TypeGet(), treeNode->TypeGet()); + fpReg = targetReg; + intReg = op1->gtRegNum; + } + else + { + ins = ins_CopyFloatToInt(op1->TypeGet(), treeNode->TypeGet()); + intReg = targetReg; + fpReg = op1->gtRegNum; + } + inst_RV_RV(ins, fpReg, intReg, targetType); + } + else + { + inst_RV_RV(ins_Copy(targetType), targetReg, genConsumeReg(op1), targetType); + } + + if (op1->IsLocal()) + { + // The lclVar will never be a def. + // If it is a last use, the lclVar will be killed by genConsumeReg(), as usual, and genProduceReg will + // appropriately set the gcInfo for the copied value. + // If not, there are two cases we need to handle: + // - If this is a TEMPORARY copy (indicated by the GTF_VAR_DEATH flag) the variable + // will remain live in its original register. + // genProduceReg() will appropriately set the gcInfo for the copied value, + // and genConsumeReg will reset it. + // - Otherwise, we need to update register info for the lclVar. 
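+            //   Here "updating register info" means: kill the old register in the liveness
+            //   and GC sets, record the copy target as the variable's new home register, and
+            //   mark that register as live for the variable (see the code below).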
+ + GenTreeLclVarCommon* lcl = op1->AsLclVarCommon(); + assert((lcl->gtFlags & GTF_VAR_DEF) == 0); + + if ((lcl->gtFlags & GTF_VAR_DEATH) == 0 && (treeNode->gtFlags & GTF_VAR_DEATH) == 0) + { + LclVarDsc* varDsc = &compiler->lvaTable[lcl->gtLclNum]; + + // If we didn't just spill it (in genConsumeReg, above), then update the register info + if (varDsc->lvRegNum != REG_STK) + { + // The old location is dying + genUpdateRegLife(varDsc, /*isBorn*/ false, /*isDying*/ true DEBUGARG(op1)); + + gcInfo.gcMarkRegSetNpt(genRegMask(op1->gtRegNum)); + + genUpdateVarReg(varDsc, treeNode); + + // The new location is going live + genUpdateRegLife(varDsc, /*isBorn*/ true, /*isDying*/ false DEBUGARG(treeNode)); + } + } + } + } + + genProduceReg(treeNode); +} + +// Check that registers are consumed in the right order for the current node being generated. +#ifdef DEBUG +void CodeGen::genCheckConsumeNode(GenTree* treeNode) +{ + // GT_PUTARG_REG is consumed out of order. + if (treeNode->gtSeqNum != 0 && treeNode->OperGet() != GT_PUTARG_REG) + { + if (lastConsumedNode != nullptr) + { + if (treeNode == lastConsumedNode) + { + if (verbose) + { + printf("Node was consumed twice:\n "); + compiler->gtDispTree(treeNode, nullptr, nullptr, true); + } + } + else + { + if (verbose && (lastConsumedNode->gtSeqNum > treeNode->gtSeqNum)) + { + printf("Nodes were consumed out-of-order:\n"); + compiler->gtDispTree(lastConsumedNode, nullptr, nullptr, true); + compiler->gtDispTree(treeNode, nullptr, nullptr, true); + } + // assert(lastConsumedNode->gtSeqNum < treeNode->gtSeqNum); + } + } + lastConsumedNode = treeNode; + } +} +#endif // DEBUG + +//-------------------------------------------------------------------- +// genConsumeReg: Do liveness update for a subnode that is being +// consumed by codegen. +// +// Arguments: +// tree - GenTree node +// +// Return Value: +// Returns the reg number of tree. +// In case of multi-reg call node returns the first reg number +// of the multi-reg return. +regNumber CodeGen::genConsumeReg(GenTree* tree) +{ + if (tree->OperGet() == GT_COPY) + { + genRegCopy(tree); + } + + // Handle the case where we have a lclVar that needs to be copied before use (i.e. because it + // interferes with one of the other sources (or the target, if it's a "delayed use" register)). + // TODO-Cleanup: This is a special copyReg case in LSRA - consider eliminating these and + // always using GT_COPY to make the lclVar location explicit. + // Note that we have to do this before calling genUpdateLife because otherwise if we spill it + // the lvRegNum will be set to REG_STK and we will lose track of what register currently holds + // the lclVar (normally when a lclVar is spilled it is then used from its former register + // location, which matches the gtRegNum on the node). + // (Note that it doesn't matter if we call this before or after genUnspillRegIfNeeded + // because if it's on the stack it will always get reloaded into tree->gtRegNum). + if (genIsRegCandidateLocal(tree)) + { + GenTreeLclVarCommon* lcl = tree->AsLclVarCommon(); + LclVarDsc* varDsc = &compiler->lvaTable[lcl->GetLclNum()]; + if (varDsc->lvRegNum != REG_STK && varDsc->lvRegNum != tree->gtRegNum) + { + inst_RV_RV(INS_mov, tree->gtRegNum, varDsc->lvRegNum); + } + } + + genUnspillRegIfNeeded(tree); + + // genUpdateLife() will also spill local var if marked as GTF_SPILL by calling CodeGen::genSpillVar + genUpdateLife(tree); + + assert(tree->gtHasReg()); + + // there are three cases where consuming a reg means clearing the bit in the live mask + // 1. 
it was not produced by a local + // 2. it was produced by a local that is going dead + // 3. it was produced by a local that does not live in that reg (like one allocated on the stack) + + if (genIsRegCandidateLocal(tree)) + { + GenTreeLclVarCommon* lcl = tree->AsLclVarCommon(); + LclVarDsc* varDsc = &compiler->lvaTable[lcl->GetLclNum()]; + assert(varDsc->lvLRACandidate); + + if ((tree->gtFlags & GTF_VAR_DEATH) != 0) + { + gcInfo.gcMarkRegSetNpt(genRegMask(varDsc->lvRegNum)); + } + else if (varDsc->lvRegNum == REG_STK) + { + // We have loaded this into a register only temporarily + gcInfo.gcMarkRegSetNpt(genRegMask(tree->gtRegNum)); + } + } + else + { + gcInfo.gcMarkRegSetNpt(tree->gtGetRegMask()); + } + + genCheckConsumeNode(tree); + return tree->gtRegNum; +} + +// Do liveness update for an address tree: one of GT_LEA, GT_LCL_VAR, or GT_CNS_INT (for call indirect). +void CodeGen::genConsumeAddress(GenTree* addr) +{ + if (!addr->isContained()) + { + genConsumeReg(addr); + } + else if (addr->OperGet() == GT_LEA) + { + genConsumeAddrMode(addr->AsAddrMode()); + } +} + +// do liveness update for a subnode that is being consumed by codegen +void CodeGen::genConsumeAddrMode(GenTreeAddrMode* addr) +{ + genConsumeOperands(addr); +} + +void CodeGen::genConsumeRegs(GenTree* tree) +{ +#if !defined(_TARGET_64BIT_) + if (tree->OperGet() == GT_LONG) + { + genConsumeRegs(tree->gtGetOp1()); + genConsumeRegs(tree->gtGetOp2()); + return; + } +#endif // !defined(_TARGET_64BIT_) + + if (tree->isContained()) + { + if (tree->isContainedSpillTemp()) + { + // spill temps are un-tracked and hence no need to update life + } + else if (tree->isIndir()) + { + genConsumeAddress(tree->AsIndir()->Addr()); + } + else if (tree->OperGet() == GT_AND) + { + // This is the special contained GT_AND that we created in Lowering::LowerCmp() + // Now we need to consume the operands of the GT_AND node. + genConsumeOperands(tree->AsOp()); + } + else if (tree->OperGet() == GT_LCL_VAR) + { + // A contained lcl var must be living on stack and marked as reg optional. + unsigned varNum = tree->AsLclVarCommon()->GetLclNum(); + LclVarDsc* varDsc = compiler->lvaTable + varNum; + + noway_assert(varDsc->lvRegNum == REG_STK); + noway_assert(tree->IsRegOptional()); + + // Update the life of reg optional lcl var. + genUpdateLife(tree); + } + else + { + assert(tree->OperIsLeaf()); + } + } + else + { + genConsumeReg(tree); + } +} + +//------------------------------------------------------------------------ +// genConsumeOperands: Do liveness update for the operands of a unary or binary tree +// +// Arguments: +// tree - the GenTreeOp whose operands will have their liveness updated. +// +// Return Value: +// None. +// +// Notes: +// Note that this logic is localized here because we must do the liveness update in +// the correct execution order. This is important because we may have two operands +// that involve the same lclVar, and if one is marked "lastUse" we must handle it +// after the first. 
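+//    For example, when GTF_REVERSE_OPS is set on the node, op2 was evaluated first, so
+//    it is also consumed (and its liveness updated) before op1.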
+ +void CodeGen::genConsumeOperands(GenTreeOp* tree) +{ + GenTree* firstOp = tree->gtOp1; + GenTree* secondOp = tree->gtOp2; + if ((tree->gtFlags & GTF_REVERSE_OPS) != 0) + { + assert(secondOp != nullptr); + firstOp = secondOp; + secondOp = tree->gtOp1; + } + if (firstOp != nullptr) + { + genConsumeRegs(firstOp); + } + if (secondOp != nullptr) + { + genConsumeRegs(secondOp); + } +} + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +//------------------------------------------------------------------------ +// genConsumePutStructArgStk: Do liveness update for the operands of a PutArgStk node. +// Also loads in the right register the addresses of the +// src/dst for rep mov operation. +// +// Arguments: +// putArgNode - the PUTARG_STK tree. +// dstReg - the dstReg for the rep move operation. +// srcReg - the srcReg for the rep move operation. +// sizeReg - the sizeReg for the rep move operation. +// baseVarNum - the varnum for the local used for placing the "by-value" args on the stack. +// +// Return Value: +// None. +// +// Note: sizeReg can be REG_NA when this function is used to consume the dstReg and srcReg +// for copying on the stack a struct with references. +// The source address/offset is determined from the address on the GT_OBJ node, while +// the destination address is the address contained in 'baseVarNum' plus the offset +// provided in the 'putArgNode'. + +void CodeGen::genConsumePutStructArgStk( + GenTreePutArgStk* putArgNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg, unsigned baseVarNum) +{ + assert(varTypeIsStruct(putArgNode)); + assert(baseVarNum != BAD_VAR_NUM); + + // The putArgNode children are always contained. We should not consume any registers. + assert(putArgNode->gtGetOp1()->isContained()); + + GenTree* dstAddr = putArgNode; + + // Get the source address. + GenTree* src = putArgNode->gtGetOp1(); + assert((src->gtOper == GT_OBJ) || ((src->gtOper == GT_IND && varTypeIsSIMD(src)))); + GenTree* srcAddr = src->gtGetOp1(); + + size_t size = putArgNode->getArgSize(); + + assert(dstReg != REG_NA); + assert(srcReg != REG_NA); + + // Consume the registers only if they are not contained or set to REG_NA. + if (srcAddr->gtRegNum != REG_NA) + { + genConsumeReg(srcAddr); + } + + // If the op1 is already in the dstReg - nothing to do. + // Otherwise load the op1 (GT_ADDR) into the dstReg to copy the struct on the stack by value. + if (dstAddr->gtRegNum != dstReg) + { + // Generate LEA instruction to load the stack of the outgoing var + SlotNum offset (or the incoming arg area + // for tail calls) in RDI. + // Destination is always local (on the stack) - use EA_PTRSIZE. + getEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, dstReg, baseVarNum, putArgNode->getArgOffset()); + } + + if (srcAddr->gtRegNum != srcReg) + { + if (srcAddr->OperIsLocalAddr()) + { + // The OperLocalAddr is always contained. + assert(srcAddr->isContained()); + GenTreeLclVarCommon* lclNode = srcAddr->AsLclVarCommon(); + + // Generate LEA instruction to load the LclVar address in RSI. + // Source is known to be on the stack. Use EA_PTRSIZE. + unsigned int offset = 0; + if (srcAddr->OperGet() == GT_LCL_FLD_ADDR) + { + offset = srcAddr->AsLclFld()->gtLclOffs; + } + getEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, srcReg, lclNode->gtLclNum, offset); + } + else + { + assert(srcAddr->gtRegNum != REG_NA); + // Source is not known to be on the stack. Use EA_BYREF. 
+ getEmitter()->emitIns_R_R(INS_mov, EA_BYREF, srcReg, srcAddr->gtRegNum); + } + } + + if (sizeReg != REG_NA) + { + inst_RV_IV(INS_mov, sizeReg, size, EA_8BYTE); + } +} +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + +//------------------------------------------------------------------------ +// genConsumeBlockSize: Ensure that the block size is in the given register +// +// Arguments: +// blkNode - The block node +// sizeReg - The register into which the block's size should go +// + +void CodeGen::genConsumeBlockSize(GenTreeBlk* blkNode, regNumber sizeReg) +{ + if (sizeReg != REG_NA) + { + unsigned blockSize = blkNode->Size(); + if (blockSize != 0) + { + assert(blkNode->gtRsvdRegs == genRegMask(sizeReg)); + genSetRegToIcon(sizeReg, blockSize); + } + else + { + noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK); + genConsumeReg(blkNode->AsDynBlk()->gtDynamicSize); + } + } +} + +//------------------------------------------------------------------------ +// genConsumeBlockDst: Ensure that the block destination address is in its +// allocated register. +// Arguments: +// blkNode - The block node +// + +void CodeGen::genConsumeBlockDst(GenTreeBlk* blkNode) +{ + GenTree* dstAddr = blkNode->Addr(); + genConsumeReg(dstAddr); +} + +//------------------------------------------------------------------------ +// genConsumeBlockSrc: Ensure that the block source address is in its +// allocated register if it is non-local. +// Arguments: +// blkNode - The block node +// +// Return Value: +// Returns the source address node, if it is non-local, +// and nullptr otherwise. + +GenTree* CodeGen::genConsumeBlockSrc(GenTreeBlk* blkNode) +{ + GenTree* src = blkNode->Data(); + if (blkNode->OperIsCopyBlkOp()) + { + // For a CopyBlk we need the address of the source. + if (src->OperGet() == GT_IND) + { + src = src->gtOp.gtOp1; + } + else + { + // This must be a local. + // For this case, there is no source address register, as it is a + // stack-based address. + assert(src->OperIsLocal()); + return nullptr; + } + } + genConsumeReg(src); + return src; +} + +//------------------------------------------------------------------------ +// genConsumeBlockOp: Ensure that the block's operands are enregistered +// as needed. +// Arguments: +// blkNode - The block node +// +// Notes: +// This ensures that the operands are consumed in the proper order to +// obey liveness modeling. + +void CodeGen::genConsumeBlockOp(GenTreeBlk* blkNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg) +{ + // We have to consume the registers, and perform any copies, in the actual execution order. + // The nominal order is: dst, src, size. However this may have been changed + // with reverse flags on the blkNode and the setting of gtEvalSizeFirst in the case of a dynamic + // block size. + // Note that the register allocator ensures that the registers ON THE NODES will not interfere + // with one another if consumed (i.e. reloaded or moved to their ASSIGNED reg) in execution order. + // Further, it ensures that they will not interfere with one another if they are then copied + // to the REQUIRED register (if a fixed register requirement) in execution order. This requires, + // then, that we first consume all the operands, then do any necessary moves. 
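+    // Concretely, the code below makes two passes: genConsumeBlock{Size,Dst,Src} in
+    // execution order first, then a "mov" into sizeReg/dstReg/srcReg for any operand
+    // that did not already end up in its required register.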
+ + GenTree* dstAddr = blkNode->Addr(); + GenTree* src = nullptr; + unsigned blockSize = blkNode->Size(); + GenTree* size = nullptr; + bool evalSizeFirst = true; + + if (blkNode->OperGet() == GT_STORE_DYN_BLK) + { + evalSizeFirst = blkNode->AsDynBlk()->gtEvalSizeFirst; + size = blkNode->AsDynBlk()->gtDynamicSize; + } + + // First, consusme all the sources in order + if (evalSizeFirst) + { + genConsumeBlockSize(blkNode, sizeReg); + } + if (blkNode->IsReverseOp()) + { + src = genConsumeBlockSrc(blkNode); + genConsumeBlockDst(blkNode); + } + else + { + genConsumeBlockDst(blkNode); + src = genConsumeBlockSrc(blkNode); + } + if (!evalSizeFirst) + { + genConsumeBlockSize(blkNode, sizeReg); + } + // Next, perform any necessary moves. + if (evalSizeFirst && (size != nullptr) && (size->gtRegNum != sizeReg)) + { + inst_RV_RV(INS_mov, sizeReg, size->gtRegNum, size->TypeGet()); + } + if (blkNode->IsReverseOp()) + { + if ((src != nullptr) && (src->gtRegNum != srcReg)) + { + inst_RV_RV(INS_mov, srcReg, src->gtRegNum, src->TypeGet()); + } + if (dstAddr->gtRegNum != dstReg) + { + inst_RV_RV(INS_mov, dstReg, dstAddr->gtRegNum, dstAddr->TypeGet()); + } + } + else + { + if (dstAddr->gtRegNum != dstReg) + { + inst_RV_RV(INS_mov, dstReg, dstAddr->gtRegNum, dstAddr->TypeGet()); + } + if ((src != nullptr) && (src->gtRegNum != srcReg)) + { + inst_RV_RV(INS_mov, srcReg, src->gtRegNum, src->TypeGet()); + } + } + if (!evalSizeFirst && size != nullptr && (size->gtRegNum != sizeReg)) + { + inst_RV_RV(INS_mov, sizeReg, size->gtRegNum, size->TypeGet()); + } +} + +//------------------------------------------------------------------------- +// genProduceReg: do liveness update for register produced by the current +// node in codegen. +// +// Arguments: +// tree - Gentree node +// +// Return Value: +// None. +void CodeGen::genProduceReg(GenTree* tree) +{ + if (tree->gtFlags & GTF_SPILL) + { + // Code for GT_COPY node gets generated as part of consuming regs by its parent. + // A GT_COPY node in turn produces reg result and it should never be marked to + // spill. + // + // Similarly GT_RELOAD node gets generated as part of consuming regs by its + // parent and should never be marked for spilling. + noway_assert(!tree->IsCopyOrReload()); + + if (genIsRegCandidateLocal(tree)) + { + // Store local variable to its home location. + tree->gtFlags &= ~GTF_REG_VAL; + // Ensure that lclVar stores are typed correctly. + unsigned varNum = tree->gtLclVarCommon.gtLclNum; + assert(!compiler->lvaTable[varNum].lvNormalizeOnStore() || + (tree->TypeGet() == genActualType(compiler->lvaTable[varNum].TypeGet()))); + inst_TT_RV(ins_Store(tree->gtType, compiler->isSIMDTypeLocalAligned(varNum)), tree, tree->gtRegNum); + } + else + { + // In case of multi-reg call node, spill flag on call node + // indicates that one or more of its allocated regs need to + // be spilled. Call node needs to be further queried to + // know which of its result regs needs to be spilled. 
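+            // Each return register whose GTF_SPILL bit is set in the call's per-register
+            // spill flags is stored to a spill temp (rsSpillTree) and removed from the
+            // GC register sets below.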
+ if (tree->IsMultiRegCall()) + { + GenTreeCall* call = tree->AsCall(); + ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); + unsigned regCount = retTypeDesc->GetReturnRegCount(); + + for (unsigned i = 0; i < regCount; ++i) + { + unsigned flags = call->GetRegSpillFlagByIdx(i); + if ((flags & GTF_SPILL) != 0) + { + regNumber reg = call->GetRegNumByIdx(i); + call->SetInReg(); + regSet.rsSpillTree(reg, call, i); + gcInfo.gcMarkRegSetNpt(genRegMask(reg)); + } + } + } + else + { + tree->SetInReg(); + regSet.rsSpillTree(tree->gtRegNum, tree); + gcInfo.gcMarkRegSetNpt(genRegMask(tree->gtRegNum)); + } + + tree->gtFlags |= GTF_SPILLED; + tree->gtFlags &= ~GTF_SPILL; + + return; + } + } + + genUpdateLife(tree); + + // If we've produced a register, mark it as a pointer, as needed. + if (tree->gtHasReg()) + { + // We only mark the register in the following cases: + // 1. It is not a register candidate local. In this case, we're producing a + // register from a local, but the local is not a register candidate. Thus, + // we must be loading it as a temp register, and any "last use" flag on + // the register wouldn't be relevant. + // 2. The register candidate local is going dead. There's no point to mark + // the register as live, with a GC pointer, if the variable is dead. + if (!genIsRegCandidateLocal(tree) || ((tree->gtFlags & GTF_VAR_DEATH) == 0)) + { + // Multi-reg call node will produce more than one register result. + // Mark all the regs produced by call node. + if (tree->IsMultiRegCall()) + { + GenTreeCall* call = tree->AsCall(); + ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); + unsigned regCount = retTypeDesc->GetReturnRegCount(); + + for (unsigned i = 0; i < regCount; ++i) + { + regNumber reg = call->GetRegNumByIdx(i); + var_types type = retTypeDesc->GetReturnRegType(i); + gcInfo.gcMarkRegPtrVal(reg, type); + } + } + else if (tree->IsCopyOrReloadOfMultiRegCall()) + { + // we should never see reload of multi-reg call here + // because GT_RELOAD gets generated in reg consuming path. + noway_assert(tree->OperGet() == GT_COPY); + + // A multi-reg GT_COPY node produces those regs to which + // copy has taken place. 
+ GenTreeCopyOrReload* copy = tree->AsCopyOrReload(); + GenTreeCall* call = copy->gtGetOp1()->AsCall(); + ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); + unsigned regCount = retTypeDesc->GetReturnRegCount(); + + for (unsigned i = 0; i < regCount; ++i) + { + var_types type = retTypeDesc->GetReturnRegType(i); + regNumber fromReg = call->GetRegNumByIdx(i); + regNumber toReg = copy->GetRegNumByIdx(i); + + if (toReg != REG_NA) + { + gcInfo.gcMarkRegPtrVal(toReg, type); + } + } + } + else + { + gcInfo.gcMarkRegPtrVal(tree->gtRegNum, tree->TypeGet()); + } + } + } + tree->SetInReg(); +} + +// transfer gc/byref status of src reg to dst reg +void CodeGen::genTransferRegGCState(regNumber dst, regNumber src) +{ + regMaskTP srcMask = genRegMask(src); + regMaskTP dstMask = genRegMask(dst); + + if (gcInfo.gcRegGCrefSetCur & srcMask) + { + gcInfo.gcMarkRegSetGCref(dstMask); + } + else if (gcInfo.gcRegByrefSetCur & srcMask) + { + gcInfo.gcMarkRegSetByref(dstMask); + } + else + { + gcInfo.gcMarkRegSetNpt(dstMask); + } +} + +// generates an ip-relative call or indirect call via reg ('call reg') +// pass in 'addr' for a relative call or 'base' for a indirect register call +// methHnd - optional, only used for pretty printing +// retSize - emitter type of return for GC purposes, should be EA_BYREF, EA_GCREF, or EA_PTRSIZE(not GC) +void CodeGen::genEmitCall(int callType, + CORINFO_METHOD_HANDLE methHnd, + INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo) void* addr X86_ARG(ssize_t argSize), + emitAttr retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize), + IL_OFFSETX ilOffset, + regNumber base, + bool isJump, + bool isNoGC) +{ +#if !defined(_TARGET_X86_) + ssize_t argSize = 0; +#endif // !defined(_TARGET_X86_) + getEmitter()->emitIns_Call(emitter::EmitCallType(callType), methHnd, INDEBUG_LDISASM_COMMA(sigInfo) addr, argSize, + retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), gcInfo.gcVarPtrSetCur, + gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur, ilOffset, base, REG_NA, 0, 0, isJump, + emitter::emitNoGChelper(compiler->eeGetHelperNum(methHnd))); +} + +// generates an indirect call via addressing mode (call []) given an indir node +// methHnd - optional, only used for pretty printing +// retSize - emitter type of return for GC purposes, should be EA_BYREF, EA_GCREF, or EA_PTRSIZE(not GC) +void CodeGen::genEmitCall(int callType, + CORINFO_METHOD_HANDLE methHnd, + INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo) GenTreeIndir* indir X86_ARG(ssize_t argSize), + emitAttr retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize), + IL_OFFSETX ilOffset) +{ +#if !defined(_TARGET_X86_) + ssize_t argSize = 0; +#endif // !defined(_TARGET_X86_) + genConsumeAddress(indir->Addr()); + + getEmitter()->emitIns_Call(emitter::EmitCallType(callType), methHnd, INDEBUG_LDISASM_COMMA(sigInfo) nullptr, + argSize, retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), + gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur, ilOffset, + indir->Base() ? indir->Base()->gtRegNum : REG_NA, + indir->Index() ? indir->Index()->gtRegNum : REG_NA, indir->Scale(), indir->Offset()); +} + +//------------------------------------------------------------------------ +// genStoreInd: Generate code for a GT_STOREIND node. +// +// Arguments: +// treeNode - The GT_STOREIND node for which to generate code. 
+// +// Return Value: +// none + +void CodeGen::genStoreInd(GenTreePtr node) +{ + assert(node->OperGet() == GT_STOREIND); + +#ifdef FEATURE_SIMD + // Storing Vector3 of size 12 bytes through indirection + if (node->TypeGet() == TYP_SIMD12) + { + genStoreIndTypeSIMD12(node); + return; + } +#endif // FEATURE_SIMD + + GenTreeStoreInd* storeInd = node->AsStoreInd(); + GenTree* data = storeInd->Data(); + GenTree* addr = storeInd->Addr(); + var_types targetType = storeInd->TypeGet(); + + assert(!varTypeIsFloating(targetType) || (targetType == data->TypeGet())); + + GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(storeInd, data); + if (writeBarrierForm != GCInfo::WBF_NoBarrier) + { + // data and addr must be in registers. + // Consume both registers so that any copies of interfering registers are taken care of. + genConsumeOperands(storeInd->AsOp()); + + if (genEmitOptimizedGCWriteBarrier(writeBarrierForm, addr, data)) + { + return; + } + + // At this point, we should not have any interference. + // That is, 'data' must not be in REG_ARG_0, as that is where 'addr' must go. + noway_assert(data->gtRegNum != REG_ARG_0); + + // addr goes in REG_ARG_0 + if (addr->gtRegNum != REG_ARG_0) + { + inst_RV_RV(INS_mov, REG_ARG_0, addr->gtRegNum, addr->TypeGet()); + } + + // data goes in REG_ARG_1 + if (data->gtRegNum != REG_ARG_1) + { + inst_RV_RV(INS_mov, REG_ARG_1, data->gtRegNum, data->TypeGet()); + } + + genGCWriteBarrier(storeInd, writeBarrierForm); + } + else + { + bool reverseOps = ((storeInd->gtFlags & GTF_REVERSE_OPS) != 0); + bool dataIsUnary = false; + bool isRMWMemoryOp = storeInd->IsRMWMemoryOp(); + GenTree* rmwSrc = nullptr; + + // We must consume the operands in the proper execution order, so that liveness is + // updated appropriately. + if (!reverseOps) + { + genConsumeAddress(addr); + } + + // If storeInd represents a RMW memory op then its data is a non-leaf node marked as contained + // and non-indir operand of data is the source of RMW memory op. + if (isRMWMemoryOp) + { + assert(data->isContained() && !data->OperIsLeaf()); + + GenTreePtr rmwDst = nullptr; + + dataIsUnary = (GenTree::OperIsUnary(data->OperGet()) != 0); + if (!dataIsUnary) + { + if (storeInd->IsRMWDstOp1()) + { + rmwDst = data->gtGetOp1(); + rmwSrc = data->gtGetOp2(); + } + else + { + assert(storeInd->IsRMWDstOp2()); + rmwDst = data->gtGetOp2(); + rmwSrc = data->gtGetOp1(); + } + + genConsumeRegs(rmwSrc); + } + else + { + // *(p) = oper *(p): Here addr = p, rmwsrc=rmwDst = *(p) i.e. GT_IND(p) + // For unary RMW ops, src and dst of RMW memory op is the same. Lower + // clears operand counts on rmwSrc and we don't need to perform a + // genConsumeReg() on it. + assert(storeInd->IsRMWDstOp1()); + rmwSrc = data->gtGetOp1(); + rmwDst = data->gtGetOp1(); + assert(rmwSrc->isContained()); + } + + assert(rmwSrc != nullptr); + assert(rmwDst != nullptr); + assert(Lowering::IndirsAreEquivalent(rmwDst, storeInd)); + } + else + { + genConsumeRegs(data); + } + + if (reverseOps) + { + genConsumeAddress(addr); + } + + if (isRMWMemoryOp) + { + if (dataIsUnary) + { + // generate code for unary RMW memory ops like neg/not + getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(storeInd), + storeInd); + } + else + { + if (data->OperIsShiftOrRotate()) + { + // Generate code for shift RMW memory ops. + // The data address needs to be op1 (it must be [addr] = [addr] <shift> <amount>, not [addr] = + // <amount> <shift> [addr]). 
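+                    // For example (a sketch): "*p = *p << 3" for an int* p can be emitted as a
+                    // single "shl dword ptr [addr], 3" rather than a load/shift/store sequence.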
+ assert(storeInd->IsRMWDstOp1()); + assert(rmwSrc == data->gtGetOp2()); + genCodeForShiftRMW(storeInd); + } + else + { + // generate code for remaining binary RMW memory ops like add/sub/and/or/xor + getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(storeInd), + storeInd, rmwSrc); + } + } + } + else + { + getEmitter()->emitInsMov(ins_Store(data->TypeGet()), emitTypeSize(storeInd), storeInd); + } + } +} + +//------------------------------------------------------------------------ +// genEmitOptimizedGCWriteBarrier: Generate write barrier store using the optimized +// helper functions. +// +// Arguments: +// writeBarrierForm - the write barrier form to use +// addr - the address at which to do the store +// data - the data to store +// +// Return Value: +// true if an optimized write barrier form was used, false if not. If this +// function returns false, the caller must emit a "standard" write barrier. + +bool CodeGen::genEmitOptimizedGCWriteBarrier(GCInfo::WriteBarrierForm writeBarrierForm, GenTree* addr, GenTree* data) +{ + assert(writeBarrierForm != GCInfo::WBF_NoBarrier); + +#if defined(_TARGET_X86_) && NOGC_WRITE_BARRIERS + bool useOptimizedWriteBarriers = true; + +#ifdef DEBUG + useOptimizedWriteBarriers = + (writeBarrierForm != GCInfo::WBF_NoBarrier_CheckNotHeapInDebug); // This one is always a call to a C++ method. +#endif + + if (!useOptimizedWriteBarriers) + { + return false; + } + + const static int regToHelper[2][8] = { + // If the target is known to be in managed memory + { + CORINFO_HELP_ASSIGN_REF_EAX, CORINFO_HELP_ASSIGN_REF_ECX, -1, CORINFO_HELP_ASSIGN_REF_EBX, -1, + CORINFO_HELP_ASSIGN_REF_EBP, CORINFO_HELP_ASSIGN_REF_ESI, CORINFO_HELP_ASSIGN_REF_EDI, + }, + + // Don't know if the target is in managed memory + { + CORINFO_HELP_CHECKED_ASSIGN_REF_EAX, CORINFO_HELP_CHECKED_ASSIGN_REF_ECX, -1, + CORINFO_HELP_CHECKED_ASSIGN_REF_EBX, -1, CORINFO_HELP_CHECKED_ASSIGN_REF_EBP, + CORINFO_HELP_CHECKED_ASSIGN_REF_ESI, CORINFO_HELP_CHECKED_ASSIGN_REF_EDI, + }, + }; + + noway_assert(regToHelper[0][REG_EAX] == CORINFO_HELP_ASSIGN_REF_EAX); + noway_assert(regToHelper[0][REG_ECX] == CORINFO_HELP_ASSIGN_REF_ECX); + noway_assert(regToHelper[0][REG_EBX] == CORINFO_HELP_ASSIGN_REF_EBX); + noway_assert(regToHelper[0][REG_ESP] == -1); + noway_assert(regToHelper[0][REG_EBP] == CORINFO_HELP_ASSIGN_REF_EBP); + noway_assert(regToHelper[0][REG_ESI] == CORINFO_HELP_ASSIGN_REF_ESI); + noway_assert(regToHelper[0][REG_EDI] == CORINFO_HELP_ASSIGN_REF_EDI); + + noway_assert(regToHelper[1][REG_EAX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EAX); + noway_assert(regToHelper[1][REG_ECX] == CORINFO_HELP_CHECKED_ASSIGN_REF_ECX); + noway_assert(regToHelper[1][REG_EBX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBX); + noway_assert(regToHelper[1][REG_ESP] == -1); + noway_assert(regToHelper[1][REG_EBP] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBP); + noway_assert(regToHelper[1][REG_ESI] == CORINFO_HELP_CHECKED_ASSIGN_REF_ESI); + noway_assert(regToHelper[1][REG_EDI] == CORINFO_HELP_CHECKED_ASSIGN_REF_EDI); + + regNumber reg = data->gtRegNum; + noway_assert((reg != REG_ESP) && (reg != REG_WRITE_BARRIER)); + + // Generate the following code: + // lea edx, addr + // call write_barrier_helper_reg + + // addr goes in REG_ARG_0 + if (addr->gtRegNum != REG_WRITE_BARRIER) // REVIEW: can it ever not already by in this register? 
+ { + inst_RV_RV(INS_mov, REG_WRITE_BARRIER, addr->gtRegNum, addr->TypeGet()); + } + + unsigned tgtAnywhere = 0; + if (writeBarrierForm != GCInfo::WBF_BarrierUnchecked) + { + tgtAnywhere = 1; + } + + // We might want to call a modified version of genGCWriteBarrier() to get the benefit of + // the FEATURE_COUNT_GC_WRITE_BARRIERS code there, but that code doesn't look like it works + // with rationalized RyuJIT IR. So, for now, just emit the helper call directly here. + + genEmitHelperCall(regToHelper[tgtAnywhere][reg], + 0, // argSize + EA_PTRSIZE); // retSize + + return true; +#else // !defined(_TARGET_X86_) || !NOGC_WRITE_BARRIERS + return false; +#endif // !defined(_TARGET_X86_) || !NOGC_WRITE_BARRIERS +} + +// Produce code for a GT_CALL node +void CodeGen::genCallInstruction(GenTreePtr node) +{ + GenTreeCall* call = node->AsCall(); + assert(call->gtOper == GT_CALL); + + gtCallTypes callType = (gtCallTypes)call->gtCallType; + + IL_OFFSETX ilOffset = BAD_IL_OFFSET; + + // all virtuals should have been expanded into a control expression + assert(!call->IsVirtual() || call->gtControlExpr || call->gtCallAddr); + + // Consume all the arg regs + for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext()) + { + assert(list->IsList()); + + GenTreePtr argNode = list->Current(); + + fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode->gtSkipReloadOrCopy()); + assert(curArgTabEntry); + + if (curArgTabEntry->regNum == REG_STK) + { + continue; + } + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + // Deal with multi register passed struct args. + if (argNode->OperGet() == GT_LIST) + { + GenTreeArgList* argListPtr = argNode->AsArgList(); + unsigned iterationNum = 0; + for (; argListPtr != nullptr; argListPtr = argListPtr->Rest(), iterationNum++) + { + GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1; + assert(putArgRegNode->gtOper == GT_PUTARG_REG); + regNumber argReg = REG_NA; + + if (iterationNum == 0) + { + argReg = curArgTabEntry->regNum; + } + else + { + assert(iterationNum == 1); + argReg = curArgTabEntry->otherRegNum; + } + + genConsumeReg(putArgRegNode); + + // Validate the putArgRegNode has the right type. + assert(putArgRegNode->TypeGet() == + compiler->GetTypeFromClassificationAndSizes(curArgTabEntry->structDesc + .eightByteClassifications[iterationNum], + curArgTabEntry->structDesc + .eightByteSizes[iterationNum])); + if (putArgRegNode->gtRegNum != argReg) + { + inst_RV_RV(ins_Move_Extend(putArgRegNode->TypeGet(), putArgRegNode->InReg()), argReg, + putArgRegNode->gtRegNum); + } + } + } + else +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + { + regNumber argReg = curArgTabEntry->regNum; + genConsumeReg(argNode); + if (argNode->gtRegNum != argReg) + { + inst_RV_RV(ins_Move_Extend(argNode->TypeGet(), argNode->InReg()), argReg, argNode->gtRegNum); + } + } + +#if FEATURE_VARARG + // In the case of a varargs call, + // the ABI dictates that if we have floating point args, + // we must pass the enregistered arguments in both the + // integer and floating point registers so, let's do that. + if (call->IsVarargs() && varTypeIsFloating(argNode)) + { + regNumber targetReg = compiler->getCallArgIntRegister(argNode->gtRegNum); + instruction ins = ins_CopyFloatToInt(argNode->TypeGet(), TYP_LONG); + inst_RV_RV(ins, argNode->gtRegNum, targetReg); + } +#endif // FEATURE_VARARG + } + +#if defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + // The call will pop its arguments. 
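+    // The byte count accumulated below is later passed to the emitter (on x86) and used to
+    // restore genStackLevel and to re-adjust ESP after the call when the caller pops the args.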
+ // for each putarg_stk: + ssize_t stackArgBytes = 0; + GenTreePtr args = call->gtCallArgs; + while (args) + { + GenTreePtr arg = args->gtOp.gtOp1; + if (arg->OperGet() != GT_ARGPLACE && !(arg->gtFlags & GTF_LATE_ARG)) + { +#if defined(_TARGET_X86_) + assert((arg->OperGet() == GT_PUTARG_STK) || (arg->OperGet() == GT_LONG)); + if (arg->OperGet() == GT_LONG) + { + assert((arg->gtGetOp1()->OperGet() == GT_PUTARG_STK) && (arg->gtGetOp2()->OperGet() == GT_PUTARG_STK)); + } +#endif // defined(_TARGET_X86_) + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + if (genActualType(arg->TypeGet()) == TYP_STRUCT) + { + assert(arg->OperGet() == GT_PUTARG_STK); + + GenTreeObj* obj = arg->gtGetOp1()->AsObj(); + stackArgBytes = compiler->info.compCompHnd->getClassSize(obj->gtClass); + } + else +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + + stackArgBytes += genTypeSize(genActualType(arg->TypeGet())); + } + args = args->gtOp.gtOp2; + } +#endif // defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + + // Insert a null check on "this" pointer if asked. + if (call->NeedsNullCheck()) + { + const regNumber regThis = genGetThisArgReg(call); + getEmitter()->emitIns_AR_R(INS_cmp, EA_4BYTE, regThis, regThis, 0); + } + + // Either gtControlExpr != null or gtCallAddr != null or it is a direct non-virtual call to a user or helper method. + CORINFO_METHOD_HANDLE methHnd; + GenTree* target = call->gtControlExpr; + if (callType == CT_INDIRECT) + { + assert(target == nullptr); + target = call->gtCall.gtCallAddr; + methHnd = nullptr; + } + else + { + methHnd = call->gtCallMethHnd; + } + + CORINFO_SIG_INFO* sigInfo = nullptr; +#ifdef DEBUG + // Pass the call signature information down into the emitter so the emitter can associate + // native call sites with the signatures they were generated from. + if (callType != CT_HELPER) + { + sigInfo = call->callSig; + } +#endif // DEBUG + + // If fast tail call, then we are done. In this case we setup the args (both reg args + // and stack args in incoming arg area) and call target in rax. Epilog sequence would + // generate "jmp rax". + if (call->IsFastTailCall()) + { + // Don't support fast tail calling JIT helpers + assert(callType != CT_HELPER); + + // Fast tail calls materialize call target either in gtControlExpr or in gtCallAddr. + assert(target != nullptr); + + genConsumeReg(target); + if (target->gtRegNum != REG_RAX) + { + inst_RV_RV(INS_mov, REG_RAX, target->gtRegNum); + } + return; + } + + // For a pinvoke to unmanged code we emit a label to clear + // the GC pointer state before the callsite. + // We can't utilize the typical lazy killing of GC pointers + // at (or inside) the callsite. + if (call->IsUnmanaged()) + { + genDefineTempLabel(genCreateTempLabel()); + } + + // Determine return value size(s). 
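+    // For example, a TYP_REF return value is reported as EA_GCREF so the GC treats the result
+    // register as a live object reference, while a multi-reg struct return records an emitAttr
+    // for each return register (first/second below).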
+ ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); + emitAttr retSize = EA_PTRSIZE; + emitAttr secondRetSize = EA_UNKNOWN; + + if (call->HasMultiRegRetVal()) + { + retSize = emitTypeSize(retTypeDesc->GetReturnRegType(0)); + secondRetSize = emitTypeSize(retTypeDesc->GetReturnRegType(1)); + } + else + { + assert(!varTypeIsStruct(call)); + + if (call->gtType == TYP_REF || call->gtType == TYP_ARRAY) + { + retSize = EA_GCREF; + } + else if (call->gtType == TYP_BYREF) + { + retSize = EA_BYREF; + } + } + + bool fPossibleSyncHelperCall = false; + CorInfoHelpFunc helperNum = CORINFO_HELP_UNDEF; + +#ifdef DEBUGGING_SUPPORT + // We need to propagate the IL offset information to the call instruction, so we can emit + // an IL to native mapping record for the call, to support managed return value debugging. + // We don't want tail call helper calls that were converted from normal calls to get a record, + // so we skip this hash table lookup logic in that case. + if (compiler->opts.compDbgInfo && compiler->genCallSite2ILOffsetMap != nullptr && !call->IsTailCall()) + { + (void)compiler->genCallSite2ILOffsetMap->Lookup(call, &ilOffset); + } +#endif // DEBUGGING_SUPPORT + +#if defined(_TARGET_X86_) + // If the callee pops the arguments, we pass a positive value as the argSize, and the emitter will + // adjust its stack level accordingly. + // If the caller needs to explicitly pop its arguments, we must pass a negative value, and then do the + // pop when we're done. + ssize_t argSizeForEmitter = stackArgBytes; + if ((call->gtFlags & GTF_CALL_POP_ARGS) != 0) + { + argSizeForEmitter = -stackArgBytes; + } + +#endif // defined(_TARGET_X86_) + + if (target != nullptr) + { + if (target->isContainedIndir()) + { + if (target->AsIndir()->HasBase() && target->AsIndir()->Base()->isContainedIntOrIImmed()) + { + // Note that if gtControlExpr is an indir of an absolute address, we mark it as + // contained only if it can be encoded as PC-relative offset. + assert(target->AsIndir()->Base()->AsIntConCommon()->FitsInAddrBase(compiler)); + + genEmitCall(emitter::EC_FUNC_TOKEN_INDIR, methHnd, + INDEBUG_LDISASM_COMMA(sigInfo)(void*) target->AsIndir() + ->Base() + ->AsIntConCommon() + ->IconValue() X86_ARG(argSizeForEmitter), + retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), ilOffset); + } + else + { + genEmitCall(emitter::EC_INDIR_ARD, methHnd, + INDEBUG_LDISASM_COMMA(sigInfo) target->AsIndir() X86_ARG(argSizeForEmitter), + retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), ilOffset); + } + } + else + { + // We have already generated code for gtControlExpr evaluating it into a register. + // We just need to emit "call reg" in this case. + assert(genIsValidIntReg(target->gtRegNum)); + genEmitCall(emitter::EC_INDIR_R, methHnd, + INDEBUG_LDISASM_COMMA(sigInfo) nullptr // addr + X86_ARG(argSizeForEmitter), + retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), ilOffset, genConsumeReg(target)); + } + } +#ifdef FEATURE_READYTORUN_COMPILER + else if (call->gtEntryPoint.addr != nullptr) + { + genEmitCall((call->gtEntryPoint.accessType == IAT_VALUE) ? 
emitter::EC_FUNC_TOKEN + : emitter::EC_FUNC_TOKEN_INDIR, + methHnd, INDEBUG_LDISASM_COMMA(sigInfo)(void*) call->gtEntryPoint.addr X86_ARG(argSizeForEmitter), + retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), ilOffset); + } +#endif + else + { + // Generate a direct call to a non-virtual user defined or helper method + assert(callType == CT_HELPER || callType == CT_USER_FUNC); + + void* addr = nullptr; + if (callType == CT_HELPER) + { + // Direct call to a helper method. + helperNum = compiler->eeGetHelperNum(methHnd); + noway_assert(helperNum != CORINFO_HELP_UNDEF); + + void* pAddr = nullptr; + addr = compiler->compGetHelperFtn(helperNum, (void**)&pAddr); + + if (addr == nullptr) + { + addr = pAddr; + } + + // tracking of region protected by the monitor in synchronized methods + if (compiler->info.compFlags & CORINFO_FLG_SYNCH) + { + fPossibleSyncHelperCall = true; + } + } + else + { + // Direct call to a non-virtual user function. + addr = call->gtDirectCallAddress; + } + + // Non-virtual direct calls to known addresses + genEmitCall(emitter::EC_FUNC_TOKEN, methHnd, INDEBUG_LDISASM_COMMA(sigInfo) addr X86_ARG(argSizeForEmitter), + retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), ilOffset); + } + + // if it was a pinvoke we may have needed to get the address of a label + if (genPendingCallLabel) + { + assert(call->IsUnmanaged()); + genDefineTempLabel(genPendingCallLabel); + genPendingCallLabel = nullptr; + } + +#if defined(_TARGET_X86_) + // The call will pop its arguments. + genStackLevel -= stackArgBytes; +#endif // defined(_TARGET_X86_) + + // Update GC info: + // All Callee arg registers are trashed and no longer contain any GC pointers. + // TODO-XArch-Bug?: As a matter of fact shouldn't we be killing all of callee trashed regs here? + // For now we will assert that other than arg regs gc ref/byref set doesn't contain any other + // registers from RBM_CALLEE_TRASH. + assert((gcInfo.gcRegGCrefSetCur & (RBM_CALLEE_TRASH & ~RBM_ARG_REGS)) == 0); + assert((gcInfo.gcRegByrefSetCur & (RBM_CALLEE_TRASH & ~RBM_ARG_REGS)) == 0); + gcInfo.gcRegGCrefSetCur &= ~RBM_ARG_REGS; + gcInfo.gcRegByrefSetCur &= ~RBM_ARG_REGS; + + var_types returnType = call->TypeGet(); + if (returnType != TYP_VOID) + { +#ifdef _TARGET_X86_ + if (varTypeIsFloating(returnType)) + { + // Spill the value from the fp stack. + // Then, load it into the target register. + call->gtFlags |= GTF_SPILL; + regSet.rsSpillFPStack(call); + call->gtFlags |= GTF_SPILLED; + call->gtFlags &= ~GTF_SPILL; + } + else +#endif // _TARGET_X86_ + { + regNumber returnReg; + + if (call->HasMultiRegRetVal()) + { + assert(retTypeDesc != nullptr); + unsigned regCount = retTypeDesc->GetReturnRegCount(); + + // If regs allocated to call node are different from ABI return + // regs in which the call has returned its result, move the result + // to regs allocated to call node. + for (unsigned i = 0; i < regCount; ++i) + { + var_types regType = retTypeDesc->GetReturnRegType(i); + returnReg = retTypeDesc->GetABIReturnReg(i); + regNumber allocatedReg = call->GetRegNumByIdx(i); + if (returnReg != allocatedReg) + { + inst_RV_RV(ins_Copy(regType), allocatedReg, returnReg, regType); + } + } + +#ifdef FEATURE_SIMD + // A Vector3 return value is stored in xmm0 and xmm1. + // RyuJIT assumes that the upper unused bits of xmm1 are cleared but + // the native compiler doesn't guarantee it. + if (returnType == TYP_SIMD12) + { + returnReg = retTypeDesc->GetABIReturnReg(1); + // Clear the upper 32 bits by two shift instructions. 
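+                    // (pslldq/psrldq shift the full 128-bit xmm register by a byte count, so the
+                    // immediate of 12 bytes used below corresponds to the 96 bits shown here.)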
+ // retReg = retReg << 96 + // retReg = retReg >> 96 + getEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12); + getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12); + } +#endif // FEATURE_SIMD + } + else + { +#ifdef _TARGET_X86_ + if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME)) + { + // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with + // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the + // correct argument registers. + returnReg = REG_PINVOKE_TCB; + } + else +#endif // _TARGET_X86_ + if (varTypeIsFloating(returnType)) + { + returnReg = REG_FLOATRET; + } + else + { + returnReg = REG_INTRET; + } + + if (call->gtRegNum != returnReg) + { + inst_RV_RV(ins_Copy(returnType), call->gtRegNum, returnReg, returnType); + } + } + + genProduceReg(call); + } + } + + // If there is nothing next, that means the result is thrown away, so this value is not live. + // However, for minopts or debuggable code, we keep it live to support managed return value debugging. + if ((call->gtNext == nullptr) && !compiler->opts.MinOpts() && !compiler->opts.compDbgCode) + { + gcInfo.gcMarkRegSetNpt(RBM_INTRET); + } + +#if defined(_TARGET_X86_) + //------------------------------------------------------------------------- + // Create a label for tracking of region protected by the monitor in synchronized methods. + // This needs to be here, rather than above where fPossibleSyncHelperCall is set, + // so the GC state vars have been updated before creating the label. + + if (fPossibleSyncHelperCall) + { + switch (helperNum) + { + case CORINFO_HELP_MON_ENTER: + case CORINFO_HELP_MON_ENTER_STATIC: + noway_assert(compiler->syncStartEmitCookie == NULL); + compiler->syncStartEmitCookie = + getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur); + noway_assert(compiler->syncStartEmitCookie != NULL); + break; + case CORINFO_HELP_MON_EXIT: + case CORINFO_HELP_MON_EXIT_STATIC: + noway_assert(compiler->syncEndEmitCookie == NULL); + compiler->syncEndEmitCookie = + getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur); + noway_assert(compiler->syncEndEmitCookie != NULL); + break; + default: + break; + } + } + + // Is the caller supposed to pop the arguments? + if (((call->gtFlags & GTF_CALL_POP_ARGS) != 0) && (stackArgBytes != 0)) + { + genAdjustSP(stackArgBytes); + } +#endif // _TARGET_X86_ +} + +// Produce code for a GT_JMP node. +// The arguments of the caller needs to be transferred to the callee before exiting caller. +// The actual jump to callee is generated as part of caller epilog sequence. +// Therefore the codegen of GT_JMP is to ensure that the callee arguments are correctly setup. +void CodeGen::genJmpMethod(GenTreePtr jmp) +{ + assert(jmp->OperGet() == GT_JMP); + assert(compiler->compJmpOpUsed); + + // If no arguments, nothing to do + if (compiler->info.compArgsCount == 0) + { + return; + } + + // Make sure register arguments are in their initial registers + // and stack arguments are put back as well. + unsigned varNum; + LclVarDsc* varDsc; + + // First move any en-registered stack arguments back to the stack. + // At the same time any reg arg not in correct reg is moved back to its stack location. 
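+    // For example (an illustrative sketch): if incoming arg 0 currently lives in RSI but must be
+    // in RCX at the jmp target, the first pass below stores RSI to arg 0's stack home, and the
+    // second pass reloads RCX from that stack slot; spilling through the stack sidesteps any
+    // register-to-register cycles.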
+ // + // We are not strictly required to spill reg args that are not in the desired reg for a jmp call + // But that would require us to deal with circularity while moving values around. Spilling + // to stack makes the implementation simple, which is not a bad trade off given Jmp calls + // are not frequent. + for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++) + { + varDsc = compiler->lvaTable + varNum; + + if (varDsc->lvPromoted) + { + noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here + + unsigned fieldVarNum = varDsc->lvFieldLclStart; + varDsc = compiler->lvaTable + fieldVarNum; + } + noway_assert(varDsc->lvIsParam); + + if (varDsc->lvIsRegArg && (varDsc->lvRegNum != REG_STK)) + { + // Skip reg args which are already in its right register for jmp call. + // If not, we will spill such args to their stack locations. + // + // If we need to generate a tail call profiler hook, then spill all + // arg regs to free them up for the callback. + if (!compiler->compIsProfilerHookNeeded() && (varDsc->lvRegNum == varDsc->lvArgReg)) + { + continue; + } + } + else if (varDsc->lvRegNum == REG_STK) + { + // Skip args which are currently living in stack. + continue; + } + + // If we came here it means either a reg argument not in the right register or + // a stack argument currently living in a register. In either case the following + // assert should hold. + assert(varDsc->lvRegNum != REG_STK); + + var_types loadType = varDsc->lvaArgType(); + getEmitter()->emitIns_S_R(ins_Store(loadType), emitTypeSize(loadType), varDsc->lvRegNum, varNum, 0); + + // Update lvRegNum life and GC info to indicate lvRegNum is dead and varDsc stack slot is going live. + // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it. + // Therefore manually update life of varDsc->lvRegNum. + regMaskTP tempMask = varDsc->lvRegMask(); + regSet.RemoveMaskVars(tempMask); + gcInfo.gcMarkRegSetNpt(tempMask); + if (compiler->lvaIsGCTracked(varDsc)) + { +#ifdef DEBUG + if (!VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex)) + { + JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming live\n", varNum); + } + else + { + JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing live\n", varNum); + } +#endif // DEBUG + + VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); + } + } + +#ifdef PROFILING_SUPPORTED + // At this point all arg regs are free. + // Emit tail call profiler callback. + genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL); +#endif + + // Next move any un-enregistered register arguments back to their register. + regMaskTP fixedIntArgMask = RBM_NONE; // tracks the int arg regs occupying fixed args in case of a vararg method. + unsigned firstArgVarNum = BAD_VAR_NUM; // varNum of the first argument in case of a vararg method. + for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++) + { + varDsc = compiler->lvaTable + varNum; + if (varDsc->lvPromoted) + { + noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here + + unsigned fieldVarNum = varDsc->lvFieldLclStart; + varDsc = compiler->lvaTable + fieldVarNum; + } + noway_assert(varDsc->lvIsParam); + + // Skip if arg not passed in a register. 
+ if (!varDsc->lvIsRegArg) + { + continue; + } + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + if (varTypeIsStruct(varDsc)) + { + CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle(); + assert(typeHnd != nullptr); + + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc); + assert(structDesc.passedInRegisters); + + unsigned __int8 offset0 = 0; + unsigned __int8 offset1 = 0; + var_types type0 = TYP_UNKNOWN; + var_types type1 = TYP_UNKNOWN; + + // Get the eightbyte data + compiler->GetStructTypeOffset(structDesc, &type0, &type1, &offset0, &offset1); + + // Move the values into the right registers. + // + + // Update varDsc->lvArgReg and lvOtherArgReg life and GC Info to indicate varDsc stack slot is dead and + // argReg is going live. Note that we cannot modify varDsc->lvRegNum and lvOtherArgReg here because another + // basic block may not be expecting it. Therefore manually update life of argReg. Note that GT_JMP marks + // the end of the basic block and after which reg life and gc info will be recomputed for the new block in + // genCodeForBBList(). + if (type0 != TYP_UNKNOWN) + { + getEmitter()->emitIns_R_S(ins_Load(type0), emitTypeSize(type0), varDsc->lvArgReg, varNum, offset0); + regSet.rsMaskVars |= genRegMask(varDsc->lvArgReg); + gcInfo.gcMarkRegPtrVal(varDsc->lvArgReg, type0); + } + + if (type1 != TYP_UNKNOWN) + { + getEmitter()->emitIns_R_S(ins_Load(type1), emitTypeSize(type1), varDsc->lvOtherArgReg, varNum, offset1); + regSet.rsMaskVars |= genRegMask(varDsc->lvOtherArgReg); + gcInfo.gcMarkRegPtrVal(varDsc->lvOtherArgReg, type1); + } + + if (varDsc->lvTracked) + { + VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); + } + } + else +#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + { + // Register argument + noway_assert(isRegParamType(genActualType(varDsc->TypeGet()))); + + // Is register argument already in the right register? + // If not load it from its stack location. + var_types loadType = varDsc->lvaArgType(); + regNumber argReg = varDsc->lvArgReg; // incoming arg register + + if (varDsc->lvRegNum != argReg) + { + assert(genIsValidReg(argReg)); + getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0); + + // Update argReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live. + // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it. + // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block + // and after which reg life and gc info will be recomputed for the new block in genCodeForBBList(). + regSet.AddMaskVars(genRegMask(argReg)); + gcInfo.gcMarkRegPtrVal(argReg, loadType); + if (compiler->lvaIsGCTracked(varDsc)) + { +#ifdef DEBUG + if (VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex)) + { + JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming dead\n", varNum); + } + else + { + JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing dead\n", varNum); + } +#endif // DEBUG + + VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); + } + } + } + +#if FEATURE_VARARG && defined(_TARGET_AMD64_) + // In case of a jmp call to a vararg method also pass the float/double arg in the corresponding int arg + // register. This is due to the AMD64 ABI which requires floating point values passed to varargs functions to + // be passed in both integer and floating point registers. 
It doesn't apply to x86, which passes floating point + // values on the stack. + if (compiler->info.compIsVarArgs) + { + regNumber intArgReg; + var_types loadType = varDsc->lvaArgType(); + regNumber argReg = varDsc->lvArgReg; // incoming arg register + + if (varTypeIsFloating(loadType)) + { + intArgReg = compiler->getCallArgIntRegister(argReg); + instruction ins = ins_CopyFloatToInt(loadType, TYP_LONG); + inst_RV_RV(ins, argReg, intArgReg, loadType); + } + else + { + intArgReg = argReg; + } + + fixedIntArgMask |= genRegMask(intArgReg); + + if (intArgReg == REG_ARG_0) + { + assert(firstArgVarNum == BAD_VAR_NUM); + firstArgVarNum = varNum; + } + } +#endif // FEATURE_VARARG + } + +#if FEATURE_VARARG && defined(_TARGET_AMD64_) + // Jmp call to a vararg method - if the method has fewer than 4 fixed arguments, + // load the remaining arg registers (both int and float) from the corresponding + // shadow stack slots. This is for the reason that we don't know the number and type + // of non-fixed params passed by the caller, therefore we have to assume the worst case + // of caller passing float/double args both in int and float arg regs. + // + // This doesn't apply to x86, which doesn't pass floating point values in floating + // point registers. + // + // The caller could have passed gc-ref/byref type var args. Since these are var args + // the callee no way of knowing their gc-ness. Therefore, mark the region that loads + // remaining arg registers from shadow stack slots as non-gc interruptible. + if (fixedIntArgMask != RBM_NONE) + { + assert(compiler->info.compIsVarArgs); + assert(firstArgVarNum != BAD_VAR_NUM); + + regMaskTP remainingIntArgMask = RBM_ARG_REGS & ~fixedIntArgMask; + if (remainingIntArgMask != RBM_NONE) + { + instruction insCopyIntToFloat = ins_CopyIntToFloat(TYP_LONG, TYP_DOUBLE); + getEmitter()->emitDisableGC(); + for (int argNum = 0, argOffset = 0; argNum < MAX_REG_ARG; ++argNum) + { + regNumber argReg = intArgRegs[argNum]; + regMaskTP argRegMask = genRegMask(argReg); + + if ((remainingIntArgMask & argRegMask) != 0) + { + remainingIntArgMask &= ~argRegMask; + getEmitter()->emitIns_R_S(INS_mov, EA_8BYTE, argReg, firstArgVarNum, argOffset); + + // also load it in corresponding float arg reg + regNumber floatReg = compiler->getCallArgFloatRegister(argReg); + inst_RV_RV(insCopyIntToFloat, floatReg, argReg); + } + + argOffset += REGSIZE_BYTES; + } + getEmitter()->emitEnableGC(); + } + } +#endif // FEATURE_VARARG +} + +// produce code for a GT_LEA subnode +void CodeGen::genLeaInstruction(GenTreeAddrMode* lea) +{ + emitAttr size = emitTypeSize(lea); + genConsumeOperands(lea); + + if (lea->Base() && lea->Index()) + { + regNumber baseReg = lea->Base()->gtRegNum; + regNumber indexReg = lea->Index()->gtRegNum; + getEmitter()->emitIns_R_ARX(INS_lea, size, lea->gtRegNum, baseReg, indexReg, lea->gtScale, lea->gtOffset); + } + else if (lea->Base()) + { + getEmitter()->emitIns_R_AR(INS_lea, size, lea->gtRegNum, lea->Base()->gtRegNum, lea->gtOffset); + } + else if (lea->Index()) + { + getEmitter()->emitIns_R_ARX(INS_lea, size, lea->gtRegNum, REG_NA, lea->Index()->gtRegNum, lea->gtScale, + lea->gtOffset); + } + + genProduceReg(lea); +} + +//------------------------------------------------------------------------------------------- +// genJumpKindsForTree: Determine the number and kinds of conditional branches +// necessary to implement the given GT_CMP node +// +// Arguments: +// cmpTree - (input) The GenTree node that is used to set the Condition codes +// - The GenTree Relop node that was 
used to set the Condition codes +// jmpKind[2] - (output) One or two conditional branch instructions +// jmpToTrueLabel[2] - (output) When true we branch to the true case +// When false we create a second label and branch to the false case +// Only GT_EQ for a floating point compares can have a false value. +// +// Return Value: +// Sets the proper values into the array elements of jmpKind[] and jmpToTrueLabel[] +// +// Assumptions: +// At least one conditional branch instruction will be returned. +// Typically only one conditional branch is needed +// and the second jmpKind[] value is set to EJ_NONE +// +// Notes: +// jmpToTrueLabel[i]= true implies branch when the compare operation is true. +// jmpToTrueLabel[i]= false implies branch when the compare operation is false. +//------------------------------------------------------------------------------------------- + +// static +void CodeGen::genJumpKindsForTree(GenTreePtr cmpTree, emitJumpKind jmpKind[2], bool jmpToTrueLabel[2]) +{ + // Except for BEQ (= ordered GT_EQ) both jumps are to the true label. + jmpToTrueLabel[0] = true; + jmpToTrueLabel[1] = true; + + // For integer comparisons just use genJumpKindForOper + if (!varTypeIsFloating(cmpTree->gtOp.gtOp1->gtEffectiveVal())) + { + CompareKind compareKind = ((cmpTree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED; + jmpKind[0] = genJumpKindForOper(cmpTree->gtOper, compareKind); + jmpKind[1] = EJ_NONE; + } + else + { + assert(cmpTree->OperIsCompare()); + + // For details on how we arrived at this mapping, see the comment block in genCodeForTreeNode() + // while generating code for compare opererators (e.g. GT_EQ etc). + if ((cmpTree->gtFlags & GTF_RELOP_NAN_UN) != 0) + { + // Must branch if we have an NaN, unordered + switch (cmpTree->gtOper) + { + case GT_LT: + case GT_GT: + jmpKind[0] = EJ_jb; + jmpKind[1] = EJ_NONE; + break; + + case GT_LE: + case GT_GE: + jmpKind[0] = EJ_jbe; + jmpKind[1] = EJ_NONE; + break; + + case GT_NE: + jmpKind[0] = EJ_jpe; + jmpKind[1] = EJ_jne; + break; + + case GT_EQ: + jmpKind[0] = EJ_je; + jmpKind[1] = EJ_NONE; + break; + + default: + unreached(); + } + } + else // ((cmpTree->gtFlags & GTF_RELOP_NAN_UN) == 0) + { + // Do not branch if we have an NaN, unordered + switch (cmpTree->gtOper) + { + case GT_LT: + case GT_GT: + jmpKind[0] = EJ_ja; + jmpKind[1] = EJ_NONE; + break; + + case GT_LE: + case GT_GE: + jmpKind[0] = EJ_jae; + jmpKind[1] = EJ_NONE; + break; + + case GT_NE: + jmpKind[0] = EJ_jne; + jmpKind[1] = EJ_NONE; + break; + + case GT_EQ: + jmpKind[0] = EJ_jpe; + jmpKind[1] = EJ_je; + jmpToTrueLabel[0] = false; + break; + + default: + unreached(); + } + } + } +} + +#if !defined(_TARGET_64BIT_) +//------------------------------------------------------------------------ +// genJumpKindsForTreeLongHi: Generate the jump types for compare +// operators of the high parts of a compare with long type operands +// on x86 for the case where rel-op result needs to be materialized into a +// register. +// +// Arguments: +// cmpTree - The GT_CMP node +// jmpKind - Return array of jump kinds +// jmpToTrueLabel - Return array of if the jump is going to true label +// +// Return Value: +// None. +// +void CodeGen::genJumpKindsForTreeLongHi(GenTreePtr cmpTree, emitJumpKind jmpKind[2]) +{ + assert(cmpTree->OperIsCompare()); + CompareKind compareKind = ((cmpTree->gtFlags & GTF_UNSIGNED) != 0) ? 
CK_UNSIGNED : CK_SIGNED; + + switch (cmpTree->gtOper) + { + case GT_LT: + case GT_LE: + if (compareKind == CK_SIGNED) + { + jmpKind[0] = EJ_jl; + jmpKind[1] = EJ_jg; + } + else + { + jmpKind[0] = EJ_jb; + jmpKind[1] = EJ_ja; + } + break; + + case GT_GT: + case GT_GE: + if (compareKind == CK_SIGNED) + { + jmpKind[0] = EJ_jg; + jmpKind[1] = EJ_jl; + } + else + { + jmpKind[0] = EJ_ja; + jmpKind[1] = EJ_jb; + } + break; + + case GT_EQ: + // GT_EQ will not jump to the true label if the hi parts are equal + jmpKind[0] = EJ_NONE; + jmpKind[1] = EJ_jne; + break; + + case GT_NE: + // GT_NE will always jump to the true label if the high parts are not equal + jmpKind[0] = EJ_jne; + jmpKind[1] = EJ_NONE; + break; + + default: + unreached(); + } +} + +//------------------------------------------------------------------------ +// genCompareLong: Generate code for comparing two longs on x86 when the result of the compare +// is manifested in a register. +// +// Arguments: +// treeNode - the compare tree +// +// Return Value: +// None. +// Comments: +// For long compares, we need to compare the high parts of operands first, then the low parts. +// If the high compare is false, we do not need to compare the low parts. For less than and +// greater than, if the high compare is true, we can assume the entire compare is true. For +// compares that are realized in a register, we will generate: +// +// Opcode x86 equivalent Comment +// ------ -------------- ------- +// GT_EQ cmp hiOp1,hiOp2 If any part is not equal, the entire compare +// jne label is false. +// cmp loOp1,loOp2 +// label: sete +// +// GT_NE cmp hiOp1,hiOp2 If any part is not equal, the entire compare +// jne label is true. +// cmp loOp1,loOp2 +// label: setne +// +// GT_LT; unsigned cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set +// jne label correctly and we do not need to check lo. Otherwise, +// cmp loOp1,loOp2 we need to compare the lo halves +// label: setb +// +// GT_LE; unsigned cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set +// jne label correctly and we do not need to check lo. Otherwise, +// cmp loOp1,loOp2 we need to compare the lo halves +// label: setbe +// +// GT_GT; unsigned cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set +// jne label correctly and we do not need to check lo. Otherwise, +// cmp loOp1,loOp2 we need to compare the lo halves +// label: seta +// +// GT_GE; unsigned cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set +// jne label correctly and we do not need to check lo. Otherwise, +// cmp loOp1,loOp2 we need to compare the lo halves +// label: setae +// +// For signed long comparisons, we need additional labels, as we need to use signed conditions on the +// "set" instruction: +// +// GT_LT; signed cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set +// jne labelHi correctly and we do not need to check lo. Otherwise, +// cmp loOp1,loOp2 we need to compare the lo halves +// setb Unsigned set for lo compare +// jmp labelFinal +// labelHi: setl Signed set for high compare +// labelFinal: +// +// GT_LE; signed cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set +// jne labelHi correctly and we do not need to check lo. 
Otherwise, +// cmp loOp1,loOp2 we need to compare the lo halves +// setbe Unsigend set for lo compare +// jmp labelFinal +// labelHi: setle Signed set for hi compare +// labelFinal: +// +// GT_GT; signed cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set +// jne labelHi correctly and we do not need to check lo. Otherwise, +// cmp loOp1,loOp2 we need to compare the lo halves +// seta Unsigned set for lo compare +// jmp labelFinal +// labelHi: setg Signed set for high compare +// labelFinal +// +// GT_GE; signed cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set +// jne labelHi correctly and we do not need to check lo. Otherwise, +// cmp loOp1,loOp2 we need to compare the lo halves +// setae Unsigned set for lo compare +// jmp labelFinal +// labelHi: setge Signed set for hi compare +// labelFinal: +// +// TODO-X86-CQ: Check if hi or lo parts of op2 are 0 and change the compare to a test. +void CodeGen::genCompareLong(GenTreePtr treeNode) +{ + assert(treeNode->OperIsCompare()); + + GenTreeOp* tree = treeNode->AsOp(); + GenTreePtr op1 = tree->gtOp1; + GenTreePtr op2 = tree->gtOp2; + + assert(varTypeIsLong(op1->TypeGet())); + assert(varTypeIsLong(op2->TypeGet())); + + regNumber targetReg = treeNode->gtRegNum; + + genConsumeOperands(tree); + + assert(targetReg != REG_NA); + + GenTreePtr loOp1 = op1->gtGetOp1(); + GenTreePtr hiOp1 = op1->gtGetOp2(); + GenTreePtr loOp2 = op2->gtGetOp1(); + GenTreePtr hiOp2 = op2->gtGetOp2(); + + // Create compare for the high parts + instruction ins = INS_cmp; + var_types cmpType = TYP_INT; + emitAttr cmpAttr = emitTypeSize(cmpType); + + // Emit the compare instruction + getEmitter()->emitInsBinary(ins, cmpAttr, hiOp1, hiOp2); + + // Generate the first jump for the high compare + CompareKind compareKind = ((tree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED; + + BasicBlock* labelHi = genCreateTempLabel(); + BasicBlock* labelFinal = genCreateTempLabel(); + + if (compareKind == CK_SIGNED && (tree->gtOper != GT_NE && tree->gtOper != GT_EQ)) + { + // If we are doing a signed comparison, we need to do a signed set if the high compare is true, + // but an unsigned set if we fall through to the low compare. If we have a GT_NE or GT_EQ, we do not + // need to worry about the sign of the comparison, so we can use the simplified case. + + // We only have to check for equality for the hi comparison. If they are not equal, then the set will + // do the right thing. If they are equal, we have to check the lo halves. + inst_JMP(EJ_jne, labelHi); + + // Emit the comparison. Perform the set for the lo. Jump to labelFinal + getEmitter()->emitInsBinary(ins, cmpAttr, loOp1, loOp2); + + // The low set must be unsigned + emitJumpKind jumpKindLo = genJumpKindForOper(tree->gtOper, CK_UNSIGNED); + + inst_SET(jumpKindLo, targetReg); + // Set the higher bytes to 0 + inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE)); + genProduceReg(tree); + + inst_JMP(EJ_jmp, labelFinal); + + // Define the label for hi jump target here. If we have jumped here, we want to set + // the target register based on the jump kind of the actual compare type. 
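+        // For instance, for a signed GT_LT the overall shape sketched in the header comment is:
+        //     cmp hiOp1, hiOp2 ; jne labelHi
+        //     cmp loOp1, loOp2 ; setb ; jmp labelFinal
+        //     labelHi:  setl
+        //     labelFinal: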
+ + genDefineTempLabel(labelHi); + inst_SET(genJumpKindForOper(tree->gtOper, compareKind), targetReg); + + // Set the higher bytes to 0 + inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE)); + genProduceReg(tree); + + genDefineTempLabel(labelFinal); + } + else + { + // If the compare is unsigned, or if the sign doesn't change the set instruction, we can use + // the same set logic for both the hi and lo compare, so we don't need to jump to a high label, + // we can just jump to the set that the lo compare will use. + + // We only have to check for equality for the hi comparison. If they are not equal, then the set will + // do the right thing. If they are equal, we have to check the lo halves. + inst_JMP(EJ_jne, labelFinal); + + // Emit the comparison + getEmitter()->emitInsBinary(ins, cmpAttr, loOp1, loOp2); + + // Define the label for hi jump target here. If we have jumped here, we want to set + // the target register based on the jump kind of the lower half (the actual compare + // type). If we have fallen through, then we are doing a normal int compare for the + // lower parts + + genDefineTempLabel(labelFinal); + + // The low set must be unsigned + emitJumpKind jumpKindLo = genJumpKindForOper(tree->gtOper, CK_UNSIGNED); + + inst_SET(jumpKindLo, targetReg); + // Set the higher bytes to 0 + inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE)); + genProduceReg(tree); + } +} + +//------------------------------------------------------------------------ +// genJTrueLong: Generate code for comparing two longs on x86 for the case where the result +// is not manifested in a register. +// +// Arguments: +// treeNode - the compare tree +// +// Return Value: +// None. +// Comments: +// For long compares, we need to compare the high parts of operands first, then the low parts. +// We only have to do the low compare if the high parts of the operands are equal. +// +// In the case where the result of a rel-op is not realized in a register, we generate: +// +// Opcode x86 equivalent Comment +// ------ -------------- ------- +// +// GT_LT; unsigned cmp hiOp1,hiOp2 +// jb trueLabel +// ja falseLabel +// cmp loOp1,loOp2 +// jb trueLabel +// falseLabel: +// +// GT_LE; unsigned cmp hiOp1,hiOp2 +// jb trueLabel +// ja falseLabel +// cmp loOp1,loOp2 +// jbe trueLabel +// falseLabel: +// +// GT_GT; unsigned cmp hiOp1,hiOp2 +// ja trueLabel +// jb falseLabel +// cmp loOp1,loOp2 +// ja trueLabel +// falseLabel: +// +// GT_GE; unsigned cmp hiOp1,hiOp2 +// ja trueLabel +// jb falseLabel +// cmp loOp1,loOp2 +// jae trueLabel +// falseLabel: +// +// GT_LT; signed cmp hiOp1,hiOp2 +// jl trueLabel +// jg falseLabel +// cmp loOp1,loOp2 +// jb trueLabel +// falseLabel: +// +// GT_LE; signed cmp hiOp1,hiOp2 +// jl trueLabel +// jg falseLabel +// cmp loOp1,loOp2 +// jbe trueLabel +// falseLabel: +// +// GT_GT; signed cmp hiOp1,hiOp2 +// jg trueLabel +// jl falseLabel +// cmp loOp1,loOp2 +// ja trueLabel +// falseLabel: +// +// GT_GE; signed cmp hiOp1,hiOp2 +// jg trueLabel +// jl falseLabel +// cmp loOp1,loOp2 +// jae trueLabel +// falseLabel: +// +// GT_EQ; cmp hiOp1,hiOp2 +// jne falseLabel +// cmp loOp1,loOp2 +// je trueLabel +// falseLabel: +// +// GT_NE; cmp hiOp1,hiOp2 +// jne labelTrue +// cmp loOp1,loOp2 +// jne trueLabel +// falseLabel: +// +// TODO-X86-CQ: Check if hi or lo parts of op2 are 0 and change the compare to a test. 
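+// Illustrative note: on 32-bit targets each TYP_LONG operand reaches this point as a
+// GT_LONG(loPart, hiPart) pair, which is why the code below reads the halves via
+// gtGetOp1()/gtGetOp2() of op1 and op2.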
+void CodeGen::genJTrueLong(GenTreePtr treeNode) +{ + assert(treeNode->OperIsCompare()); + + GenTreeOp* tree = treeNode->AsOp(); + GenTreePtr op1 = tree->gtOp1; + GenTreePtr op2 = tree->gtOp2; + + assert(varTypeIsLong(op1->TypeGet())); + assert(varTypeIsLong(op2->TypeGet())); + + regNumber targetReg = treeNode->gtRegNum; + + assert(targetReg == REG_NA); + + GenTreePtr loOp1 = op1->gtGetOp1(); + GenTreePtr hiOp1 = op1->gtGetOp2(); + GenTreePtr loOp2 = op2->gtGetOp1(); + GenTreePtr hiOp2 = op2->gtGetOp2(); + + // Emit the compare instruction + getEmitter()->emitInsBinary(INS_cmp, EA_4BYTE, hiOp1, hiOp2); + + // Generate the first jump for the high compare + CompareKind compareKind = ((tree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED; + + // TODO-X86-CQ: If the next block is a BBJ_ALWAYS, we can set falseLabel = compiler->compCurBB->bbNext->bbJumpDest. + BasicBlock* falseLabel = genCreateTempLabel(); + + emitJumpKind jumpKindHi[2]; + + // Generate the jumps for the high compare + genJumpKindsForTreeLongHi(tree, jumpKindHi); + + BasicBlock* trueLabel = compiler->compCurBB->bbJumpDest; + + if (jumpKindHi[0] != EJ_NONE) + { + inst_JMP(jumpKindHi[0], trueLabel); + } + + if (jumpKindHi[1] != EJ_NONE) + { + inst_JMP(jumpKindHi[1], falseLabel); + } + + // The low jump must be unsigned + emitJumpKind jumpKindLo = genJumpKindForOper(tree->gtOper, CK_UNSIGNED); + + // Emit the comparison and the jump to the trueLabel + getEmitter()->emitInsBinary(INS_cmp, EA_4BYTE, loOp1, loOp2); + + inst_JMP(jumpKindLo, trueLabel); + + // Generate falseLabel, which is the false path. We will jump here if the high compare is false + // or fall through if the low compare is false. + genDefineTempLabel(falseLabel); +} +#endif //! defined(_TARGET_64BIT_) + +//------------------------------------------------------------------------ +// genCompareFloat: Generate code for comparing two floating point values +// +// Arguments: +// treeNode - the compare tree +// +// Return Value: +// None. +// Comments: +// SSE2 instruction ucomis[s|d] is performs unordered comparison and +// updates rFLAGS register as follows. +// Result of compare ZF PF CF +// ----------------- ------------ +// Unordered 1 1 1 <-- this result implies one of operands of compare is a NAN. +// Greater 0 0 0 +// Less Than 0 0 1 +// Equal 1 0 0 +// +// From the above table the following equalities follow. As per ECMA spec *.UN opcodes perform +// unordered comparison of floating point values. That is *.UN comparisons result in true when +// one of the operands is a NaN whereas ordered comparisons results in false. +// +// Opcode Amd64 equivalent Comment +// ------ ----------------- -------- +// BLT.UN(a,b) ucomis[s|d] a, b Jb branches if CF=1, which means either a<b or unordered from the above +// jb table +// +// BLT(a,b) ucomis[s|d] b, a Ja branches if CF=0 and ZF=0, which means b>a that in turn implies a<b +// ja +// +// BGT.UN(a,b) ucomis[s|d] b, a branch if b<a or unordered ==> branch if a>b or unordered +// jb +// +// BGT(a, b) ucomis[s|d] a, b branch if a>b +// ja +// +// BLE.UN(a,b) ucomis[s|d] a, b jbe branches if CF=1 or ZF=1, which implies a<=b or unordered +// jbe +// +// BLE(a,b) ucomis[s|d] b, a jae branches if CF=0, which mean b>=a or a<=b +// jae +// +// BGE.UN(a,b) ucomis[s|d] b, a branch if b<=a or unordered ==> branch if a>=b or unordered +// jbe +// +// BGE(a,b) ucomis[s|d] a, b branch if a>=b +// jae +// +// BEQ.UN(a,b) ucomis[s|d] a, b branch if a==b or unordered. There is no BEQ.UN opcode in ECMA spec. 
+// je This case is given for completeness, in case if JIT generates such +// a gentree internally. +// +// BEQ(a,b) ucomis[s|d] a, b From the above table, PF=0 and ZF=1 corresponds to a==b. +// jpe L1 +// je <true label> +// L1: +// +// BNE(a,b) ucomis[s|d] a, b branch if a!=b. There is no BNE opcode in ECMA spec. This case is +// jne given for completeness, in case if JIT generates such a gentree +// internally. +// +// BNE.UN(a,b) ucomis[s|d] a, b From the above table, PF=1 or ZF=0 implies unordered or a!=b +// jpe <true label> +// jne <true label> +// +// As we can see from the above equalities that the operands of a compare operator need to be +// reveresed in case of BLT/CLT, BGT.UN/CGT.UN, BLE/CLE, BGE.UN/CGE.UN. +void CodeGen::genCompareFloat(GenTreePtr treeNode) +{ + assert(treeNode->OperIsCompare()); + + GenTreeOp* tree = treeNode->AsOp(); + GenTreePtr op1 = tree->gtOp1; + GenTreePtr op2 = tree->gtOp2; + var_types op1Type = op1->TypeGet(); + var_types op2Type = op2->TypeGet(); + + genConsumeOperands(tree); + + assert(varTypeIsFloating(op1Type)); + assert(op1Type == op2Type); + + regNumber targetReg = treeNode->gtRegNum; + instruction ins; + emitAttr cmpAttr; + + bool reverseOps; + if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0) + { + // Unordered comparison case + reverseOps = (tree->gtOper == GT_GT || tree->gtOper == GT_GE); + } + else + { + reverseOps = (tree->gtOper == GT_LT || tree->gtOper == GT_LE); + } + + if (reverseOps) + { + GenTreePtr tmp = op1; + op1 = op2; + op2 = tmp; + } + + ins = ins_FloatCompare(op1Type); + cmpAttr = emitTypeSize(op1Type); + + getEmitter()->emitInsBinary(ins, cmpAttr, op1, op2); + + // Are we evaluating this into a register? + if (targetReg != REG_NA) + { + genSetRegToCond(targetReg, tree); + genProduceReg(tree); + } +} + +//------------------------------------------------------------------------ +// genCompareInt: Generate code for comparing ints or, on amd64, longs. +// +// Arguments: +// treeNode - the compare tree +// +// Return Value: +// None. +void CodeGen::genCompareInt(GenTreePtr treeNode) +{ + assert(treeNode->OperIsCompare()); + + GenTreeOp* tree = treeNode->AsOp(); + GenTreePtr op1 = tree->gtOp1; + GenTreePtr op2 = tree->gtOp2; + var_types op1Type = op1->TypeGet(); + var_types op2Type = op2->TypeGet(); + + genConsumeOperands(tree); + + instruction ins; + emitAttr cmpAttr; + + regNumber targetReg = treeNode->gtRegNum; + assert(!op1->isContainedIntOrIImmed()); // We no longer support swapping op1 and op2 to generate cmp reg, imm + assert(!varTypeIsFloating(op2Type)); + +#ifdef _TARGET_X86_ + assert(!varTypeIsLong(op1Type) && !varTypeIsLong(op2Type)); +#endif // _TARGET_X86_ + + // By default we use an int32 sized cmp instruction + // + ins = INS_cmp; + var_types cmpType = TYP_INT; + + // In the if/then/else statement below we may change the + // 'cmpType' and/or 'ins' to generate a smaller instruction + + // Are we comparing two values that are the same size? + // + if (genTypeSize(op1Type) == genTypeSize(op2Type)) + { + if (op1Type == op2Type) + { + // If both types are exactly the same we can use that type + cmpType = op1Type; + } + else if (genTypeSize(op1Type) == 8) + { + // If we have two different int64 types we need to use a long compare + cmpType = TYP_LONG; + } + + cmpAttr = emitTypeSize(cmpType); + } + else // Here we know that (op1Type != op2Type) + { + // Do we have a short compare against a constant in op2? 
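+        // (If so - e.g. a TYP_UBYTE local compared against the constant 10 - we can emit a
+        // byte-sized compare such as "cmp byte ptr [mem], 10" instead of widening both operands
+        // to 32 bits first; an illustrative sketch.)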
+ // + // We checked for this case in LowerCmp() and if we can perform a small + // compare immediate we labeled this compare with a GTF_RELOP_SMALL + // and for unsigned small non-equality compares the GTF_UNSIGNED flag. + // + if (op2->isContainedIntOrIImmed() && ((tree->gtFlags & GTF_RELOP_SMALL) != 0)) + { + assert(varTypeIsSmall(op1Type)); + cmpType = op1Type; + } +#ifdef _TARGET_AMD64_ + else // compare two different sized operands + { + // For this case we don't want any memory operands, only registers or immediates + // + assert(!op1->isContainedMemoryOp()); + assert(!op2->isContainedMemoryOp()); + + // Check for the case where one operand is an int64 type + // Lower should have placed 32-bit operand in a register + // for signed comparisons we will sign extend the 32-bit value in place. + // + bool op1Is64Bit = (genTypeSize(op1Type) == 8); + bool op2Is64Bit = (genTypeSize(op2Type) == 8); + if (op1Is64Bit) + { + cmpType = TYP_LONG; + if (!(tree->gtFlags & GTF_UNSIGNED) && !op2Is64Bit) + { + assert(op2->gtRegNum != REG_NA); + inst_RV_RV(INS_movsxd, op2->gtRegNum, op2->gtRegNum, op2Type); + } + } + else if (op2Is64Bit) + { + cmpType = TYP_LONG; + if (!(tree->gtFlags & GTF_UNSIGNED) && !op1Is64Bit) + { + assert(op1->gtRegNum != REG_NA); + } + } + } +#endif // _TARGET_AMD64_ + + cmpAttr = emitTypeSize(cmpType); + } + + // See if we can generate a "test" instruction instead of a "cmp". + // For this to generate the correct conditional branch we must have + // a compare against zero. + // + if (op2->IsIntegralConst(0)) + { + if (op1->isContained()) + { + // op1 can be a contained memory op + // or the special contained GT_AND that we created in Lowering::LowerCmp() + // + if ((op1->OperGet() == GT_AND)) + { + noway_assert(op1->gtOp.gtOp2->isContainedIntOrIImmed()); + + ins = INS_test; // we will generate "test andOp1, andOp2CnsVal" + op2 = op1->gtOp.gtOp2; // must assign op2 before we overwrite op1 + op1 = op1->gtOp.gtOp1; // overwrite op1 + + if (op1->isContainedMemoryOp()) + { + // use the size andOp1 if it is a contained memoryop. + cmpAttr = emitTypeSize(op1->TypeGet()); + } + // fallthrough to emit->emitInsBinary(ins, cmpAttr, op1, op2); + } + } + else // op1 is not contained thus it must be in a register + { + ins = INS_test; + op2 = op1; // we will generate "test reg1,reg1" + // fallthrough to emit->emitInsBinary(ins, cmpAttr, op1, op2); + } + } + + getEmitter()->emitInsBinary(ins, cmpAttr, op1, op2); + + // Are we evaluating this into a register? + if (targetReg != REG_NA) + { + genSetRegToCond(targetReg, tree); + genProduceReg(tree); + } +} + +//------------------------------------------------------------------------------------------- +// genSetRegToCond: Set a register 'dstReg' to the appropriate one or zero value +// corresponding to a binary Relational operator result. 
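+//
+// Illustrative sketch (registers are hypothetical, not taken from this code): for a signed
+// GT_LT evaluated into EDX, the body below typically amounts to
+//      setl  dl              ; set the low byte from the condition flags
+//      movzx edx, dl         ; clear the upper bytes of the destination
+// Floating-point BEQ/BNE.UN need two setcc instructions, as explained in the body below.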
+// +// Arguments: +// dstReg - The target register to set to 1 or 0 +// tree - The GenTree Relop node that was used to set the Condition codes +// +// Return Value: none +// +// Notes: +// A full 64-bit value of either 1 or 0 is setup in the 'dstReg' +//------------------------------------------------------------------------------------------- + +void CodeGen::genSetRegToCond(regNumber dstReg, GenTreePtr tree) +{ + noway_assert((genRegMask(dstReg) & RBM_BYTE_REGS) != 0); + + emitJumpKind jumpKind[2]; + bool branchToTrueLabel[2]; + genJumpKindsForTree(tree, jumpKind, branchToTrueLabel); + + if (jumpKind[1] == EJ_NONE) + { + // Set (lower byte of) reg according to the flags + inst_SET(jumpKind[0], dstReg); + } + else + { +#ifdef DEBUG + // jmpKind[1] != EJ_NONE implies BEQ and BEN.UN of floating point values. + // These are represented by two conditions. + if (tree->gtOper == GT_EQ) + { + // This must be an ordered comparison. + assert((tree->gtFlags & GTF_RELOP_NAN_UN) == 0); + } + else + { + // This must be BNE.UN + assert((tree->gtOper == GT_NE) && ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0)); + } +#endif + + // Here is the sample code generated in each case: + // BEQ == cmp, jpe <false label>, je <true label> + // That is, to materialize comparison reg needs to be set if PF=0 and ZF=1 + // setnp reg // if (PF==0) reg = 1 else reg = 0 + // jpe L1 // Jmp if PF==1 + // sete reg + // L1: + // + // BNE.UN == cmp, jpe <true label>, jne <true label> + // That is, to materialize the comparison reg needs to be set if either PF=1 or ZF=0; + // setp reg + // jpe L1 + // setne reg + // L1: + + // reverse the jmpkind condition before setting dstReg if it is to false label. + inst_SET(branchToTrueLabel[0] ? jumpKind[0] : emitter::emitReverseJumpKind(jumpKind[0]), dstReg); + + BasicBlock* label = genCreateTempLabel(); + inst_JMP(jumpKind[0], label); + + // second branch is always to true label + assert(branchToTrueLabel[1]); + inst_SET(jumpKind[1], dstReg); + genDefineTempLabel(label); + } + + var_types treeType = tree->TypeGet(); + if (treeType == TYP_INT || treeType == TYP_LONG) + { + // Set the higher bytes to 0 + inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), dstReg, dstReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE)); + } + else + { + noway_assert(treeType == TYP_BYTE); + } +} + +//------------------------------------------------------------------------ +// genIntToIntCast: Generate code for an integer cast +// This method handles integer overflow checking casts +// as well as ordinary integer casts. +// +// Arguments: +// treeNode - The GT_CAST node +// +// Return Value: +// None. +// +// Assumptions: +// The treeNode is not a contained node and must have an assigned register. +// For a signed convert from byte, the source must be in a byte-addressable register. +// Neither the source nor target type can be a floating point type. +// +// TODO-XArch-CQ: Allow castOp to be a contained node without an assigned register. 
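+//
+// Illustrative sketch (not emitted verbatim; registers are hypothetical): an overflow-checked
+// narrowing cast such as (sbyte)x, with a 32-bit source in ECX and an 8-bit destination, is
+// emitted below as a signed range check followed by a plain move:
+//      cmp ecx, 127          ; compare against typeMax (SCHAR_MAX)
+//      jg  <overflow throw block>
+//      cmp ecx, -128         ; compare against typeMin (SCHAR_MIN)
+//      jl  <overflow throw block>
+//      mov eax, ecx          ; only if targetReg != sourceReg
+//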
+// TODO: refactor to use getCastDescription +// +void CodeGen::genIntToIntCast(GenTreePtr treeNode) +{ + assert(treeNode->OperGet() == GT_CAST); + + GenTreePtr castOp = treeNode->gtCast.CastOp(); + regNumber targetReg = treeNode->gtRegNum; + regNumber sourceReg = castOp->gtRegNum; + var_types dstType = treeNode->CastToType(); + bool isUnsignedDst = varTypeIsUnsigned(dstType); + var_types srcType = genActualType(castOp->TypeGet()); + bool isUnsignedSrc = varTypeIsUnsigned(srcType); + + // if necessary, force the srcType to unsigned when the GT_UNSIGNED flag is set + if (!isUnsignedSrc && (treeNode->gtFlags & GTF_UNSIGNED) != 0) + { + srcType = genUnsignedType(srcType); + isUnsignedSrc = true; + } + + bool requiresOverflowCheck = false; + bool needAndAfter = false; + + assert(genIsValidIntReg(targetReg)); + assert(genIsValidIntReg(sourceReg)); + + instruction ins = INS_invalid; + emitAttr size = EA_UNKNOWN; + + if (genTypeSize(srcType) < genTypeSize(dstType)) + { + // Widening cast + + // Is this an Overflow checking cast? + // We only need to handle one case, as the other casts can never overflow. + // cast from TYP_INT to TYP_ULONG + // + if (treeNode->gtOverflow() && (srcType == TYP_INT) && (dstType == TYP_ULONG)) + { + requiresOverflowCheck = true; + size = EA_ATTR(genTypeSize(srcType)); + ins = INS_mov; + } + else + { + // we need the source size + size = EA_ATTR(genTypeSize(srcType)); + noway_assert(size < EA_PTRSIZE); + + ins = ins_Move_Extend(srcType, castOp->InReg()); + + /* + Special case: ins_Move_Extend assumes the destination type is no bigger + than TYP_INT. movsx and movzx can already extend all the way to + 64-bit, and a regular 32-bit mov clears the high 32 bits (like the non-existant movzxd), + but for a sign extension from TYP_INT to TYP_LONG, we need to use movsxd opcode. + */ + if (!isUnsignedSrc && !isUnsignedDst && (size == EA_4BYTE) && (genTypeSize(dstType) > EA_4BYTE)) + { +#ifdef _TARGET_X86_ + NYI_X86("Cast to 64 bit for x86/RyuJIT"); +#else // !_TARGET_X86_ + ins = INS_movsxd; +#endif // !_TARGET_X86_ + } + + /* + Special case: for a cast of byte to char we first + have to expand the byte (w/ sign extension), then + mask off the high bits. + Use 'movsx' followed by 'and' + */ + if (!isUnsignedSrc && isUnsignedDst && (genTypeSize(dstType) < EA_4BYTE)) + { + noway_assert(genTypeSize(dstType) == EA_2BYTE && size == EA_1BYTE); + needAndAfter = true; + } + } + } + else + { + // Narrowing cast, or sign-changing cast + noway_assert(genTypeSize(srcType) >= genTypeSize(dstType)); + + // Is this an Overflow checking cast? 
+ if (treeNode->gtOverflow()) + { + requiresOverflowCheck = true; + size = EA_ATTR(genTypeSize(srcType)); + ins = INS_mov; + } + else + { + size = EA_ATTR(genTypeSize(dstType)); + ins = ins_Move_Extend(dstType, castOp->InReg()); + } + } + + noway_assert(ins != INS_invalid); + + genConsumeReg(castOp); + + if (requiresOverflowCheck) + { + ssize_t typeMin = 0; + ssize_t typeMax = 0; + ssize_t typeMask = 0; + bool needScratchReg = false; + bool signCheckOnly = false; + + /* Do we need to compare the value, or just check masks */ + + switch (dstType) + { + case TYP_BYTE: + typeMask = ssize_t((int)0xFFFFFF80); + typeMin = SCHAR_MIN; + typeMax = SCHAR_MAX; + break; + + case TYP_UBYTE: + typeMask = ssize_t((int)0xFFFFFF00L); + break; + + case TYP_SHORT: + typeMask = ssize_t((int)0xFFFF8000); + typeMin = SHRT_MIN; + typeMax = SHRT_MAX; + break; + + case TYP_CHAR: + typeMask = ssize_t((int)0xFFFF0000L); + break; + + case TYP_INT: + if (srcType == TYP_UINT) + { + signCheckOnly = true; + } + else + { + typeMask = 0xFFFFFFFF80000000LL; + typeMin = INT_MIN; + typeMax = INT_MAX; + } + break; + + case TYP_UINT: + if (srcType == TYP_INT) + { + signCheckOnly = true; + } + else + { + needScratchReg = true; + } + break; + + case TYP_LONG: + noway_assert(srcType == TYP_ULONG); + signCheckOnly = true; + break; + + case TYP_ULONG: + noway_assert((srcType == TYP_LONG) || (srcType == TYP_INT)); + signCheckOnly = true; + break; + + default: + NO_WAY("Unknown type"); + return; + } + + if (signCheckOnly) + { + // We only need to check for a negative value in sourceReg + inst_RV_IV(INS_cmp, sourceReg, 0, size); + genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW); + } + else + { + regNumber tmpReg = REG_NA; + + if (needScratchReg) + { + // We need an additional temp register + // Make sure we have exactly one allocated. + assert(treeNode->gtRsvdRegs != RBM_NONE); + assert(genCountBits(treeNode->gtRsvdRegs) == 1); + tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs); + } + + // When we are converting from unsigned or to unsigned, we + // will only have to check for any bits set using 'typeMask' + if (isUnsignedSrc || isUnsignedDst) + { + if (needScratchReg) + { + inst_RV_RV(INS_mov, tmpReg, sourceReg, TYP_LONG); // Move the 64-bit value to a writeable temp reg + inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, size, tmpReg, 32); // Shift right by 32 bits + genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); // Thow if result shift is non-zero + } + else + { + noway_assert(typeMask != 0); + inst_RV_IV(INS_TEST, sourceReg, typeMask, size); + genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); + } + } + else + { + // For a narrowing signed cast + // + // We must check the value is in a signed range. + + // Compare with the MAX + + noway_assert((typeMin != 0) && (typeMax != 0)); + + inst_RV_IV(INS_cmp, sourceReg, typeMax, size); + genJumpToThrowHlpBlk(EJ_jg, SCK_OVERFLOW); + + // Compare with the MIN + + inst_RV_IV(INS_cmp, sourceReg, typeMin, size); + genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW); + } + } + + if (targetReg != sourceReg +#ifdef _TARGET_AMD64_ + // On amd64, we can hit this path for a same-register + // 4-byte to 8-byte widening conversion, and need to + // emit the instruction to set the high bits correctly. 
+ || (EA_ATTR(genTypeSize(dstType)) == EA_8BYTE && EA_ATTR(genTypeSize(srcType)) == EA_4BYTE) +#endif // _TARGET_AMD64_ + ) + inst_RV_RV(ins, targetReg, sourceReg, srcType, size); + } + else // non-overflow checking cast + { + noway_assert(size < EA_PTRSIZE || srcType == dstType); + + // We may have code transformations that result in casts where srcType is the same as dstType. + // e.g. Bug 824281, in which a comma is split by the rationalizer, leaving an assignment of a + // long constant to a long lclVar. + if (srcType == dstType) + { + ins = INS_mov; + } + /* Is the value sitting in a non-byte-addressable register? */ + else if (castOp->InReg() && (size == EA_1BYTE) && !isByteReg(sourceReg)) + { + if (isUnsignedDst) + { + // for unsigned values we can AND, so it need not be a byte register + ins = INS_AND; + } + else + { + // Move the value into a byte register + noway_assert(!"Signed byte convert from non-byte-addressable register"); + } + + /* Generate "mov targetReg, castOp->gtReg */ + if (targetReg != sourceReg) + { + inst_RV_RV(INS_mov, targetReg, sourceReg, srcType); + } + } + + if (ins == INS_AND) + { + noway_assert((needAndAfter == false) && isUnsignedDst); + + /* Generate "and reg, MASK */ + unsigned fillPattern; + if (size == EA_1BYTE) + { + fillPattern = 0xff; + } + else if (size == EA_2BYTE) + { + fillPattern = 0xffff; + } + else + { + fillPattern = 0xffffffff; + } + + inst_RV_IV(INS_AND, targetReg, fillPattern, EA_4BYTE); + } +#ifdef _TARGET_AMD64_ + else if (ins == INS_movsxd) + { + noway_assert(!needAndAfter); + inst_RV_RV(ins, targetReg, sourceReg, srcType, size); + } +#endif // _TARGET_AMD64_ + else if (ins == INS_mov) + { + noway_assert(!needAndAfter); + if (targetReg != sourceReg +#ifdef _TARGET_AMD64_ + // On amd64, 'mov' is the opcode used to zero-extend from + // 4 bytes to 8 bytes. + || (EA_ATTR(genTypeSize(dstType)) == EA_8BYTE && EA_ATTR(genTypeSize(srcType)) == EA_4BYTE) +#endif // _TARGET_AMD64_ + ) + { + inst_RV_RV(ins, targetReg, sourceReg, srcType, size); + } + } + else + { + noway_assert(ins == INS_movsx || ins == INS_movzx); + + /* Generate "mov targetReg, castOp->gtReg */ + inst_RV_RV(ins, targetReg, sourceReg, srcType, size); + + /* Mask off high bits for cast from byte to char */ + if (needAndAfter) + { + noway_assert(genTypeSize(dstType) == 2 && ins == INS_movsx); + inst_RV_IV(INS_AND, targetReg, 0xFFFF, EA_4BYTE); + } + } + } + + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genFloatToFloatCast: Generate code for a cast between float and double +// +// Arguments: +// treeNode - The GT_CAST node +// +// Return Value: +// None. +// +// Assumptions: +// Cast is a non-overflow conversion. +// The treeNode must have an assigned register. +// The cast is between float and double or vice versa. +// +void CodeGen::genFloatToFloatCast(GenTreePtr treeNode) +{ + // float <--> double conversions are always non-overflow ones + assert(treeNode->OperGet() == GT_CAST); + assert(!treeNode->gtOverflow()); + + regNumber targetReg = treeNode->gtRegNum; + assert(genIsValidFloatReg(targetReg)); + + GenTreePtr op1 = treeNode->gtOp.gtOp1; +#ifdef DEBUG + // If not contained, must be a valid float reg. 
+ if (!op1->isContained()) + { + assert(genIsValidFloatReg(op1->gtRegNum)); + } +#endif + + var_types dstType = treeNode->CastToType(); + var_types srcType = op1->TypeGet(); + assert(varTypeIsFloating(srcType) && varTypeIsFloating(dstType)); + + genConsumeOperands(treeNode->AsOp()); + if (srcType == dstType && targetReg == op1->gtRegNum) + { + // source and destinations types are the same and also reside in the same register. + // we just need to consume and produce the reg in this case. + ; + } + else + { + instruction ins = ins_FloatConv(dstType, srcType); + getEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); + } + + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genIntToFloatCast: Generate code to cast an int/long to float/double +// +// Arguments: +// treeNode - The GT_CAST node +// +// Return Value: +// None. +// +// Assumptions: +// Cast is a non-overflow conversion. +// The treeNode must have an assigned register. +// SrcType= int32/uint32/int64/uint64 and DstType=float/double. +// +void CodeGen::genIntToFloatCast(GenTreePtr treeNode) +{ + // int type --> float/double conversions are always non-overflow ones + assert(treeNode->OperGet() == GT_CAST); + assert(!treeNode->gtOverflow()); + + regNumber targetReg = treeNode->gtRegNum; + assert(genIsValidFloatReg(targetReg)); + + GenTreePtr op1 = treeNode->gtOp.gtOp1; +#ifdef DEBUG + if (!op1->isContained()) + { + assert(genIsValidIntReg(op1->gtRegNum)); + } +#endif + + var_types dstType = treeNode->CastToType(); + var_types srcType = op1->TypeGet(); + assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType)); + +#if !defined(_TARGET_64BIT_) + NYI_IF(varTypeIsLong(srcType), "Conversion from long to float"); +#endif // !defined(_TARGET_64BIT_) + + // Since xarch emitter doesn't handle reporting gc-info correctly while casting away gc-ness we + // ensure srcType of a cast is non gc-type. Codegen should never see BYREF as source type except + // for GT_LCL_VAR_ADDR and GT_LCL_FLD_ADDR that represent stack addresses and can be considered + // as TYP_I_IMPL. In all other cases where src operand is a gc-type and not known to be on stack, + // Front-end (see fgMorphCast()) ensures this by assigning gc-type local to a non gc-type + // temp and using temp as operand of cast operation. + if (srcType == TYP_BYREF) + { + noway_assert(op1->OperGet() == GT_LCL_VAR_ADDR || op1->OperGet() == GT_LCL_FLD_ADDR); + srcType = TYP_I_IMPL; + } + + // force the srcType to unsigned if GT_UNSIGNED flag is set + if (treeNode->gtFlags & GTF_UNSIGNED) + { + srcType = genUnsignedType(srcType); + } + + noway_assert(!varTypeIsGC(srcType)); + + // We should never be seeing srcType whose size is not sizeof(int) nor sizeof(long). + // For conversions from byte/sbyte/int16/uint16 to float/double, we would expect + // either the front-end or lowering phase to have generated two levels of cast. + // The first one is for widening smaller int type to int32 and the second one is + // to the float/double. + emitAttr srcSize = EA_ATTR(genTypeSize(srcType)); + noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG)))); + + // Also we don't expect to see uint32 -> float/double and uint64 -> float conversions + // here since they should have been lowered apropriately. 
+ noway_assert(srcType != TYP_UINT); + noway_assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT)); + + // To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used + // which does a partial write to lower 4/8 bytes of xmm register keeping the other + // upper bytes unmodified. If "cvtsi2ss/sd xmmReg, r32/r64" occurs inside a loop, + // the partial write could introduce a false dependency and could cause a stall + // if there are further uses of xmmReg. We have such a case occuring with a + // customer reported version of SpectralNorm benchmark, resulting in 2x perf + // regression. To avoid false dependency, we emit "xorps xmmReg, xmmReg" before + // cvtsi2ss/sd instruction. + + genConsumeOperands(treeNode->AsOp()); + getEmitter()->emitIns_R_R(INS_xorps, EA_4BYTE, treeNode->gtRegNum, treeNode->gtRegNum); + + // Note that here we need to specify srcType that will determine + // the size of source reg/mem operand and rex.w prefix. + instruction ins = ins_FloatConv(dstType, TYP_INT); + getEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1); + + // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction + // will interpret ULONG value as LONG. Hence we need to adjust the + // result if sign-bit of srcType is set. + if (srcType == TYP_ULONG) + { + // The instruction sequence below is less accurate than what clang + // and gcc generate. However, we keep the current sequence for backward compatiblity. + // If we change the instructions below, FloatingPointUtils::convertUInt64ToDobule + // should be also updated for consistent conversion result. + assert(dstType == TYP_DOUBLE); + assert(!op1->isContained()); + + // Set the flags without modifying op1. + // test op1Reg, op1Reg + inst_RV_RV(INS_test, op1->gtRegNum, op1->gtRegNum, srcType); + + // No need to adjust result if op1 >= 0 i.e. positive + // Jge label + BasicBlock* label = genCreateTempLabel(); + inst_JMP(EJ_jge, label); + + // Adjust the result + // result = result + 0x43f00000 00000000 + // addsd resultReg, 0x43f00000 00000000 + GenTreePtr* cns = &u8ToDblBitmask; + if (*cns == nullptr) + { + double d; + static_assert_no_msg(sizeof(double) == sizeof(__int64)); + *((__int64*)&d) = 0x43f0000000000000LL; + + *cns = genMakeConst(&d, dstType, treeNode, true); + } + inst_RV_TT(INS_addsd, treeNode->gtRegNum, *cns); + + genDefineTempLabel(label); + } + + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genFloatToIntCast: Generate code to cast float/double to int/long +// +// Arguments: +// treeNode - The GT_CAST node +// +// Return Value: +// None. +// +// Assumptions: +// Cast is a non-overflow conversion. +// The treeNode must have an assigned register. +// SrcType=float/double and DstType= int32/uint32/int64/uint64 +// +// TODO-XArch-CQ: (Low-pri) - generate in-line code when DstType = uint64 +// +void CodeGen::genFloatToIntCast(GenTreePtr treeNode) +{ + // we don't expect to see overflow detecting float/double --> int type conversions here + // as they should have been converted into helper calls by front-end. 
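+    //
+    // Background (informal): the truncating SSE2 conversions used here (cvttss2si/cvttsd2si)
+    // round toward zero, e.g. 1.9 -> 1 and -1.9 -> -1, and on overflow or NaN they quietly
+    // produce the "integer indefinite" value (0x80000000 / 0x8000000000000000) instead of
+    // faulting, which is why the overflow-checked variants must go through helper calls.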
+ assert(treeNode->OperGet() == GT_CAST); + assert(!treeNode->gtOverflow()); + + regNumber targetReg = treeNode->gtRegNum; + assert(genIsValidIntReg(targetReg)); + + GenTreePtr op1 = treeNode->gtOp.gtOp1; +#ifdef DEBUG + if (!op1->isContained()) + { + assert(genIsValidFloatReg(op1->gtRegNum)); + } +#endif + + var_types dstType = treeNode->CastToType(); + var_types srcType = op1->TypeGet(); + assert(varTypeIsFloating(srcType) && !varTypeIsFloating(dstType)); + + // We should never be seeing dstType whose size is neither sizeof(TYP_INT) nor sizeof(TYP_LONG). + // For conversions to byte/sbyte/int16/uint16 from float/double, we would expect the + // front-end or lowering phase to have generated two levels of cast. The first one is + // for float or double to int32/uint32 and the second one for narrowing int32/uint32 to + // the required smaller int type. + emitAttr dstSize = EA_ATTR(genTypeSize(dstType)); + noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG)))); + + // We shouldn't be seeing uint64 here as it should have been converted + // into a helper call by either front-end or lowering phase. + noway_assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG)))); + + // If the dstType is TYP_UINT, we have 32-bits to encode the + // float number. Any of 33rd or above bits can be the sign bit. + // To acheive it we pretend as if we are converting it to a long. + if (varTypeIsUnsigned(dstType) && (dstSize == EA_ATTR(genTypeSize(TYP_INT)))) + { + dstType = TYP_LONG; + } + + // Note that we need to specify dstType here so that it will determine + // the size of destination integer register and also the rex.w prefix. + genConsumeOperands(treeNode->AsOp()); + instruction ins = ins_FloatConv(TYP_INT, srcType); + getEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genCkfinite: Generate code for ckfinite opcode. +// +// Arguments: +// treeNode - The GT_CKFINITE node +// +// Return Value: +// None. +// +// Assumptions: +// GT_CKFINITE node has reserved an internal register. +// +// TODO-XArch-CQ - mark the operand as contained if known to be in +// memory (e.g. field or an array element). +// +void CodeGen::genCkfinite(GenTreePtr treeNode) +{ + assert(treeNode->OperGet() == GT_CKFINITE); + + GenTreePtr op1 = treeNode->gtOp.gtOp1; + var_types targetType = treeNode->TypeGet(); + int expMask = (targetType == TYP_FLOAT) ? 0x7F800000 : 0x7FF00000; // Bit mask to extract exponent. + regNumber targetReg = treeNode->gtRegNum; + + // Extract exponent into a register. + assert(treeNode->gtRsvdRegs != RBM_NONE); + assert(genCountBits(treeNode->gtRsvdRegs) == 1); + regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs); + + genConsumeReg(op1); + +#ifdef _TARGET_64BIT_ + + // Copy the floating-point value to an integer register. If we copied a float to a long, then + // right-shift the value so the high 32 bits of the floating-point value sit in the low 32 + // bits of the integer register. + instruction ins = ins_CopyFloatToInt(targetType, (targetType == TYP_FLOAT) ? TYP_INT : TYP_LONG); + inst_RV_RV(ins, op1->gtRegNum, tmpReg, targetType); + if (targetType == TYP_DOUBLE) + { + // right shift by 32 bits to get to exponent. 
+ inst_RV_SH(INS_shr, EA_8BYTE, tmpReg, 32); + } + + // Mask exponent with all 1's and check if the exponent is all 1's + inst_RV_IV(INS_and, tmpReg, expMask, EA_4BYTE); + inst_RV_IV(INS_cmp, tmpReg, expMask, EA_4BYTE); + + // If exponent is all 1's, throw ArithmeticException + genJumpToThrowHlpBlk(EJ_je, SCK_ARITH_EXCPN); + + // if it is a finite value copy it to targetReg + if (targetReg != op1->gtRegNum) + { + inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType); + } + +#else // !_TARGET_64BIT_ + + // If the target type is TYP_DOUBLE, we want to extract the high 32 bits into the register. + // There is no easy way to do this. To not require an extra register, we'll use shuffles + // to move the high 32 bits into the low 32 bits, then then shuffle it back, since we + // need to produce the value into the target register. + // + // For TYP_DOUBLE, we'll generate (for targetReg != op1->gtRegNum): + // movaps targetReg, op1->gtRegNum + // shufps targetReg, targetReg, 0xB1 // WZYX => ZWXY + // mov_xmm2i tmpReg, targetReg // tmpReg <= Y + // and tmpReg, <mask> + // cmp tmpReg, <mask> + // je <throw block> + // movaps targetReg, op1->gtRegNum // copy the value again, instead of un-shuffling it + // + // For TYP_DOUBLE with (targetReg == op1->gtRegNum): + // shufps targetReg, targetReg, 0xB1 // WZYX => ZWXY + // mov_xmm2i tmpReg, targetReg // tmpReg <= Y + // and tmpReg, <mask> + // cmp tmpReg, <mask> + // je <throw block> + // shufps targetReg, targetReg, 0xB1 // ZWXY => WZYX + // + // For TYP_FLOAT, it's the same as _TARGET_64BIT_: + // mov_xmm2i tmpReg, targetReg // tmpReg <= low 32 bits + // and tmpReg, <mask> + // cmp tmpReg, <mask> + // je <throw block> + // movaps targetReg, op1->gtRegNum // only if targetReg != op1->gtRegNum + + regNumber copyToTmpSrcReg; // The register we'll copy to the integer temp. + + if (targetType == TYP_DOUBLE) + { + if (targetReg != op1->gtRegNum) + { + inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType); + } + inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, targetReg, 0xb1); + copyToTmpSrcReg = targetReg; + } + else + { + copyToTmpSrcReg = op1->gtRegNum; + } + + // Copy only the low 32 bits. This will be the high order 32 bits of the floating-point + // value, no matter the floating-point type. + inst_RV_RV(ins_CopyFloatToInt(TYP_FLOAT, TYP_INT), copyToTmpSrcReg, tmpReg, TYP_FLOAT); + + // Mask exponent with all 1's and check if the exponent is all 1's + inst_RV_IV(INS_and, tmpReg, expMask, EA_4BYTE); + inst_RV_IV(INS_cmp, tmpReg, expMask, EA_4BYTE); + + // If exponent is all 1's, throw ArithmeticException + genJumpToThrowHlpBlk(EJ_je, SCK_ARITH_EXCPN); + + if (targetReg != op1->gtRegNum) + { + // In both the TYP_FLOAT and TYP_DOUBLE case, the op1 register is untouched, + // so copy it to the targetReg. This is faster and smaller for TYP_DOUBLE + // than re-shuffling the targetReg. + inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType); + } + else if (targetType == TYP_DOUBLE) + { + // We need to re-shuffle the targetReg to get the correct result. + inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, targetReg, 0xb1); + } + +#endif // !_TARGET_64BIT_ + + genProduceReg(treeNode); +} + +#ifdef _TARGET_AMD64_ +int CodeGenInterface::genSPtoFPdelta() +{ + int delta; + +#ifdef PLATFORM_UNIX + + // We require frame chaining on Unix to support native tool unwinding (such as + // unwinding by the native debugger). 
We have a CLR-only extension to the + // unwind codes (UWOP_SET_FPREG_LARGE) to support SP->FP offsets larger than 240. + // If Unix ever supports EnC, the RSP == RBP assumption will have to be reevaluated. + delta = genTotalFrameSize(); + +#else // !PLATFORM_UNIX + + // As per Amd64 ABI, RBP offset from initial RSP can be between 0 and 240 if + // RBP needs to be reported in unwind codes. This case would arise for methods + // with localloc. + if (compiler->compLocallocUsed) + { + // We cannot base delta computation on compLclFrameSize since it changes from + // tentative to final frame layout and hence there is a possibility of + // under-estimating offset of vars from FP, which in turn results in under- + // estimating instruction size. + // + // To be predictive and so as never to under-estimate offset of vars from FP + // we will always position FP at min(240, outgoing arg area size). + delta = Min(240, (int)compiler->lvaOutgoingArgSpaceSize); + } + else if (compiler->opts.compDbgEnC) + { + // vm assumption on EnC methods is that rsp and rbp are equal + delta = 0; + } + else + { + delta = genTotalFrameSize(); + } + +#endif // !PLATFORM_UNIX + + return delta; +} + +//--------------------------------------------------------------------- +// genTotalFrameSize - return the total size of the stack frame, including local size, +// callee-saved register size, etc. For AMD64, this does not include the caller-pushed +// return address. +// +// Return value: +// Total frame size +// + +int CodeGenInterface::genTotalFrameSize() +{ + assert(!IsUninitialized(compiler->compCalleeRegsPushed)); + + int totalFrameSize = compiler->compCalleeRegsPushed * REGSIZE_BYTES + compiler->compLclFrameSize; + + assert(totalFrameSize >= 0); + return totalFrameSize; +} + +//--------------------------------------------------------------------- +// genCallerSPtoFPdelta - return the offset from Caller-SP to the frame pointer. +// This number is going to be negative, since the Caller-SP is at a higher +// address than the frame pointer. +// +// There must be a frame pointer to call this function! +// +// We can't compute this directly from the Caller-SP, since the frame pointer +// is based on a maximum delta from Initial-SP, so first we find SP, then +// compute the FP offset. + +int CodeGenInterface::genCallerSPtoFPdelta() +{ + assert(isFramePointerUsed()); + int callerSPtoFPdelta; + + callerSPtoFPdelta = genCallerSPtoInitialSPdelta() + genSPtoFPdelta(); + + assert(callerSPtoFPdelta <= 0); + return callerSPtoFPdelta; +} + +//--------------------------------------------------------------------- +// genCallerSPtoInitialSPdelta - return the offset from Caller-SP to Initial SP. +// +// This number will be negative. + +int CodeGenInterface::genCallerSPtoInitialSPdelta() +{ + int callerSPtoSPdelta = 0; + + callerSPtoSPdelta -= genTotalFrameSize(); + callerSPtoSPdelta -= REGSIZE_BYTES; // caller-pushed return address + + // compCalleeRegsPushed does not account for the frame pointer + // TODO-Cleanup: shouldn't this be part of genTotalFrameSize? 
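+    //
+    // Worked example (hypothetical frame): with compCalleeRegsPushed == 2 (0x10 bytes) and
+    // compLclFrameSize == 0x28, genTotalFrameSize() is 0x38, so the delta becomes
+    //      -0x38 (frame) - 0x8 (return address) - 0x8 (pushed RBP, added below) = -0x48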
+ if (isFramePointerUsed()) + { + callerSPtoSPdelta -= REGSIZE_BYTES; + } + + assert(callerSPtoSPdelta <= 0); + return callerSPtoSPdelta; +} +#endif // _TARGET_AMD64_ + +//----------------------------------------------------------------------------------------- +// genSSE2BitwiseOp - generate SSE2 code for the given oper as "Operand BitWiseOp BitMask" +// +// Arguments: +// treeNode - tree node +// +// Return value: +// None +// +// Assumptions: +// i) tree oper is one of GT_NEG or GT_INTRINSIC Abs() +// ii) tree type is floating point type. +// iii) caller of this routine needs to call genProduceReg() +void CodeGen::genSSE2BitwiseOp(GenTreePtr treeNode) +{ + regNumber targetReg = treeNode->gtRegNum; + var_types targetType = treeNode->TypeGet(); + assert(varTypeIsFloating(targetType)); + + float f; + double d; + GenTreePtr* bitMask = nullptr; + instruction ins = INS_invalid; + void* cnsAddr = nullptr; + bool dblAlign = false; + + switch (treeNode->OperGet()) + { + case GT_NEG: + // Neg(x) = flip the sign bit. + // Neg(f) = f ^ 0x80000000 + // Neg(d) = d ^ 0x8000000000000000 + ins = genGetInsForOper(GT_XOR, targetType); + if (targetType == TYP_FLOAT) + { + bitMask = &negBitmaskFlt; + + static_assert_no_msg(sizeof(float) == sizeof(int)); + *((int*)&f) = 0x80000000; + cnsAddr = &f; + } + else + { + bitMask = &negBitmaskDbl; + + static_assert_no_msg(sizeof(double) == sizeof(__int64)); + *((__int64*)&d) = 0x8000000000000000LL; + cnsAddr = &d; + dblAlign = true; + } + break; + + case GT_INTRINSIC: + assert(treeNode->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs); + + // Abs(x) = set sign-bit to zero + // Abs(f) = f & 0x7fffffff + // Abs(d) = d & 0x7fffffffffffffff + ins = genGetInsForOper(GT_AND, targetType); + if (targetType == TYP_FLOAT) + { + bitMask = &absBitmaskFlt; + + static_assert_no_msg(sizeof(float) == sizeof(int)); + *((int*)&f) = 0x7fffffff; + cnsAddr = &f; + } + else + { + bitMask = &absBitmaskDbl; + + static_assert_no_msg(sizeof(double) == sizeof(__int64)); + *((__int64*)&d) = 0x7fffffffffffffffLL; + cnsAddr = &d; + dblAlign = true; + } + break; + + default: + assert(!"genSSE2: unsupported oper"); + unreached(); + break; + } + + if (*bitMask == nullptr) + { + assert(cnsAddr != nullptr); + *bitMask = genMakeConst(cnsAddr, targetType, treeNode, dblAlign); + } + + // We need an additional register for bitmask. + // Make sure we have one allocated. + assert(treeNode->gtRsvdRegs != RBM_NONE); + assert(genCountBits(treeNode->gtRsvdRegs) == 1); + regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs); + + // Move operand into targetReg only if the reg reserved for + // internal purpose is not the same as targetReg. + GenTreePtr op1 = treeNode->gtOp.gtOp1; + assert(!op1->isContained()); + regNumber operandReg = genConsumeReg(op1); + if (tmpReg != targetReg) + { + if (operandReg != targetReg) + { + inst_RV_RV(ins_Copy(targetType), targetReg, operandReg, targetType); + } + + operandReg = tmpReg; + } + + inst_RV_TT(ins_Load(targetType, false), tmpReg, *bitMask); + assert(ins != INS_invalid); + inst_RV_RV(ins, targetReg, operandReg, targetType); +} + +//--------------------------------------------------------------------- +// genIntrinsic - generate code for a given intrinsic +// +// Arguments +// treeNode - the GT_INTRINSIC node +// +// Return value: +// None +// +void CodeGen::genIntrinsic(GenTreePtr treeNode) +{ + // Right now only Sqrt/Abs are treated as math intrinsics. 
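+    // For example (illustrative): a double-precision Math.Sqrt call reaches codegen as a
+    // GT_INTRINSIC node with gtIntrinsicId == CORINFO_INTRINSIC_Sqrt and is emitted as a single
+    // "sqrtsd xmm, xmm/m64" instruction; Abs is handled via a bitmask in genSSE2BitwiseOp above.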
+ switch (treeNode->gtIntrinsic.gtIntrinsicId) + { + case CORINFO_INTRINSIC_Sqrt: + noway_assert(treeNode->TypeGet() == TYP_DOUBLE); + genConsumeOperands(treeNode->AsOp()); + getEmitter()->emitInsBinary(ins_FloatSqrt(treeNode->TypeGet()), emitTypeSize(treeNode), treeNode, + treeNode->gtOp.gtOp1); + break; + + case CORINFO_INTRINSIC_Abs: + genSSE2BitwiseOp(treeNode); + break; + + default: + assert(!"genIntrinsic: Unsupported intrinsic"); + unreached(); + } + + genProduceReg(treeNode); +} + +//-------------------------------------------------------------------------- // +// getBaseVarForPutArgStk - returns the baseVarNum for passing a stack arg. +// +// Arguments +// treeNode - the GT_PUTARG_STK node +// +// Return value: +// The number of the base variable. +// +// Note: +// If tail call the outgoing args are placed in the caller's incoming arg stack space. +// Otherwise, they go in the outgoing arg area on the current frame. +// +// On Windows the caller always creates slots (homing space) in its frame for the +// first 4 arguments of a callee (register passed args). So, the baseVarNum is always 0. +// For System V systems there is no such calling convention requirement, and the code needs to find +// the first stack passed argument from the caller. This is done by iterating over +// all the lvParam variables and finding the first with lvArgReg equals to REG_STK. +// +unsigned CodeGen::getBaseVarForPutArgStk(GenTreePtr treeNode) +{ + assert(treeNode->OperGet() == GT_PUTARG_STK); + + unsigned baseVarNum; + +#if FEATURE_FASTTAILCALL + bool putInIncomingArgArea = treeNode->AsPutArgStk()->putInIncomingArgArea; +#else + const bool putInIncomingArgArea = false; +#endif + + // Whether to setup stk arg in incoming or out-going arg area? + // Fast tail calls implemented as epilog+jmp = stk arg is setup in incoming arg area. + // All other calls - stk arg is setup in out-going arg area. + if (putInIncomingArgArea) + { + // See the note in the function header re: finding the first stack passed argument. + baseVarNum = getFirstArgWithStackSlot(); + assert(baseVarNum != BAD_VAR_NUM); + +#ifdef DEBUG + // This must be a fast tail call. + assert(treeNode->AsPutArgStk()->gtCall->AsCall()->IsFastTailCall()); + + // Since it is a fast tail call, the existence of first incoming arg is guaranteed + // because fast tail call requires that in-coming arg area of caller is >= out-going + // arg area required for tail call. + LclVarDsc* varDsc = &(compiler->lvaTable[baseVarNum]); + assert(varDsc != nullptr); + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + assert(!varDsc->lvIsRegArg && varDsc->lvArgReg == REG_STK); +#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING + // On Windows this assert is always true. The first argument will always be in REG_ARG_0 or REG_FLTARG_0. + assert(varDsc->lvIsRegArg && (varDsc->lvArgReg == REG_ARG_0 || varDsc->lvArgReg == REG_FLTARG_0)); +#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING +#endif // !DEBUG + } + else + { +#if FEATURE_FIXED_OUT_ARGS + baseVarNum = compiler->lvaOutgoingArgSpaceVar; +#else // !FEATURE_FIXED_OUT_ARGS + NYI_X86("Stack args for x86/RyuJIT"); + baseVarNum = BAD_VAR_NUM; +#endif // !FEATURE_FIXED_OUT_ARGS + } + + return baseVarNum; +} + +//--------------------------------------------------------------------- // +// genPutStructArgStk - generate code for passing an arg on the stack. 
+// +// Arguments +// treeNode - the GT_PUTARG_STK node +// targetType - the type of the treeNode +// +// Return value: +// None +// +void CodeGen::genPutArgStk(GenTreePtr treeNode) +{ + var_types targetType = treeNode->TypeGet(); +#ifdef _TARGET_X86_ + noway_assert(targetType != TYP_STRUCT); + + // The following logic is applicable for x86 arch. + assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet())); + + GenTreePtr data = treeNode->gtOp.gtOp1; + + // On a 32-bit target, all of the long arguments have been decomposed into + // a separate putarg_stk for each of the upper and lower halves. + noway_assert(targetType != TYP_LONG); + + int argSize = genTypeSize(genActualType(targetType)); + genStackLevel += argSize; + + // TODO-Cleanup: Handle this in emitInsMov() in emitXArch.cpp? + if (data->isContainedIntOrIImmed()) + { + if (data->IsIconHandle()) + { + inst_IV_handle(INS_push, data->gtIntCon.gtIconVal); + } + else + { + inst_IV(INS_push, data->gtIntCon.gtIconVal); + } + } + else if (data->isContained()) + { + NYI_X86("Contained putarg_stk of non-constant"); + } + else + { + genConsumeReg(data); + if (varTypeIsIntegralOrI(targetType)) + { + inst_RV(INS_push, data->gtRegNum, targetType); + } + else + { + // Decrement SP. + inst_RV_IV(INS_sub, REG_SPBASE, argSize, emitActualTypeSize(TYP_I_IMPL)); + getEmitter()->emitIns_AR_R(ins_Store(targetType), emitTypeSize(targetType), data->gtRegNum, REG_SPBASE, 0); + } + } +#else // !_TARGET_X86_ + { + unsigned baseVarNum = getBaseVarForPutArgStk(treeNode); + +#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING + + if (varTypeIsStruct(targetType)) + { + genPutStructArgStk(treeNode, baseVarNum); + return; + } +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING + + noway_assert(targetType != TYP_STRUCT); + assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet())); + + // Get argument offset on stack. + // Here we cross check that argument offset hasn't changed from lowering to codegen since + // we are storing arg slot number in GT_PUTARG_STK node in lowering phase. + int argOffset = treeNode->AsPutArgStk()->getArgOffset(); + +#ifdef DEBUG + fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(treeNode->AsPutArgStk()->gtCall, treeNode); + assert(curArgTabEntry); + assert(argOffset == (int)curArgTabEntry->slotNum * TARGET_POINTER_SIZE); +#endif + + GenTreePtr data = treeNode->gtGetOp1(); + + if (data->isContained()) + { + getEmitter()->emitIns_S_I(ins_Store(targetType), emitTypeSize(targetType), baseVarNum, argOffset, + (int)data->AsIntConCommon()->IconValue()); + } + else + { + genConsumeReg(data); + getEmitter()->emitIns_S_R(ins_Store(targetType), emitTypeSize(targetType), data->gtRegNum, baseVarNum, + argOffset); + } + } +#endif // !_TARGET_X86_ +} + +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + +//--------------------------------------------------------------------- +// genPutStructArgStk - generate code for copying a struct arg on the stack by value. +// In case there are references to heap object in the struct, +// it generates the gcinfo as well. +// +// Arguments +// treeNode - the GT_PUTARG_STK node +// baseVarNum - the variable number relative to which to put the argument on the stack. +// For tail calls this is the baseVarNum = 0. +// For non tail calls this is the outgoingArgSpace. 
+// +// Return value: +// None +// +void CodeGen::genPutStructArgStk(GenTreePtr treeNode, unsigned baseVarNum) +{ + assert(treeNode->OperGet() == GT_PUTARG_STK); + assert(baseVarNum != BAD_VAR_NUM); + + var_types targetType = treeNode->TypeGet(); + + if (varTypeIsSIMD(targetType)) + { + regNumber srcReg = genConsumeReg(treeNode->gtGetOp1()); + assert((srcReg != REG_NA) && (genIsValidFloatReg(srcReg))); + getEmitter()->emitIns_S_R(ins_Store(targetType), emitTypeSize(targetType), srcReg, baseVarNum, + treeNode->AsPutArgStk()->getArgOffset()); + return; + } + + assert(targetType == TYP_STRUCT); + + GenTreePutArgStk* putArgStk = treeNode->AsPutArgStk(); + if (putArgStk->gtNumberReferenceSlots == 0) + { + switch (putArgStk->gtPutArgStkKind) + { + case GenTreePutArgStk::PutArgStkKindRepInstr: + genStructPutArgRepMovs(putArgStk, baseVarNum); + break; + case GenTreePutArgStk::PutArgStkKindUnroll: + genStructPutArgUnroll(putArgStk, baseVarNum); + break; + default: + unreached(); + } + } + else + { + // No need to disable GC the way COPYOBJ does. Here the refs are copied in atomic operations always. + + // Consume these registers. + // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing"). + genConsumePutStructArgStk(putArgStk, REG_RDI, REG_RSI, REG_NA, baseVarNum); + GenTreePtr dstAddr = putArgStk; + GenTreePtr src = putArgStk->gtOp.gtOp1; + assert(src->OperGet() == GT_OBJ); + GenTreePtr srcAddr = src->gtGetOp1(); + + unsigned slots = putArgStk->gtNumSlots; + + // We are always on the stack we don't need to use the write barrier. + BYTE* gcPtrs = putArgStk->gtGcPtrs; + unsigned gcPtrCount = putArgStk->gtNumberReferenceSlots; + + unsigned i = 0; + unsigned copiedSlots = 0; + while (i < slots) + { + switch (gcPtrs[i]) + { + case TYPE_GC_NONE: + // Let's see if we can use rep movsq instead of a sequence of movsq instructions + // to save cycles and code size. + { + unsigned nonGcSlotCount = 0; + + do + { + nonGcSlotCount++; + i++; + } while (i < slots && gcPtrs[i] == TYPE_GC_NONE); + + // If we have a very small contiguous non-gc region, it's better just to + // emit a sequence of movsq instructions + if (nonGcSlotCount < CPOBJ_NONGC_SLOTS_LIMIT) + { + copiedSlots += nonGcSlotCount; + while (nonGcSlotCount > 0) + { + instGen(INS_movsq); + nonGcSlotCount--; + } + } + else + { + getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, nonGcSlotCount); + copiedSlots += nonGcSlotCount; + instGen(INS_r_movsq); + } + } + break; + + case TYPE_GC_REF: // Is an object ref + case TYPE_GC_BYREF: // Is an interior pointer - promote it but don't scan it + { + // We have a GC (byref or ref) pointer + // TODO-Amd64-Unix: Here a better solution (for code size and CQ) would be to use movsq instruction, + // but the logic for emitting a GC info record is not available (it is internal for the emitter + // only.) See emitGCVarLiveUpd function. If we could call it separately, we could do + // instGen(INS_movsq); and emission of gc info. + + var_types memType; + if (gcPtrs[i] == TYPE_GC_REF) + { + memType = TYP_REF; + } + else + { + assert(gcPtrs[i] == TYPE_GC_BYREF); + memType = TYP_BYREF; + } + + getEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), REG_RCX, REG_RSI, 0); + getEmitter()->emitIns_S_R(ins_Store(memType), emitTypeSize(memType), REG_RCX, baseVarNum, + ((copiedSlots + putArgStk->gtSlotNum) * TARGET_POINTER_SIZE)); + + // Source for the copy operation. + // If a LocalAddr, use EA_PTRSIZE - copy from stack. 
+ // If not a LocalAddr, use EA_BYREF - the source location is not on the stack. + getEmitter()->emitIns_R_I(INS_add, ((src->OperIsLocalAddr()) ? EA_PTRSIZE : EA_BYREF), REG_RSI, + TARGET_POINTER_SIZE); + + // Always copying to the stack - outgoing arg area + // (or the outgoing arg area of the caller for a tail call) - use EA_PTRSIZE. + getEmitter()->emitIns_R_I(INS_add, EA_PTRSIZE, REG_RDI, TARGET_POINTER_SIZE); + copiedSlots++; + gcPtrCount--; + i++; + } + break; + + default: + unreached(); + break; + } + } + + assert(gcPtrCount == 0); + } +} +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + +/***************************************************************************** + * + * Create and record GC Info for the function. + */ +#ifdef _TARGET_AMD64_ +void +#else // !_TARGET_AMD64_ +void* +#endif // !_TARGET_AMD64_ +CodeGen::genCreateAndStoreGCInfo(unsigned codeSize, unsigned prologSize, unsigned epilogSize DEBUGARG(void* codePtr)) +{ +#ifdef JIT32_GCENCODER + return genCreateAndStoreGCInfoJIT32(codeSize, prologSize, epilogSize DEBUGARG(codePtr)); +#else // !JIT32_GCENCODER + genCreateAndStoreGCInfoX64(codeSize, prologSize DEBUGARG(codePtr)); +#endif // !JIT32_GCENCODER +} + +#ifdef JIT32_GCENCODER +void* CodeGen::genCreateAndStoreGCInfoJIT32(unsigned codeSize, + unsigned prologSize, + unsigned epilogSize DEBUGARG(void* codePtr)) +{ + BYTE headerBuf[64]; + InfoHdr header; + + int s_cached; +#ifdef DEBUG + size_t headerSize = +#endif + compiler->compInfoBlkSize = + gcInfo.gcInfoBlockHdrSave(headerBuf, 0, codeSize, prologSize, epilogSize, &header, &s_cached); + + size_t argTabOffset = 0; + size_t ptrMapSize = gcInfo.gcPtrTableSize(header, codeSize, &argTabOffset); + +#if DISPLAY_SIZES + + if (genInterruptible) + { + gcHeaderISize += compiler->compInfoBlkSize; + gcPtrMapISize += ptrMapSize; + } + else + { + gcHeaderNSize += compiler->compInfoBlkSize; + gcPtrMapNSize += ptrMapSize; + } + +#endif // DISPLAY_SIZES + + compiler->compInfoBlkSize += ptrMapSize; + + /* Allocate the info block for the method */ + + compiler->compInfoBlkAddr = (BYTE*)compiler->info.compCompHnd->allocGCInfo(compiler->compInfoBlkSize); + +#if 0 // VERBOSE_SIZES + // TODO-X86-Cleanup: 'dataSize', below, is not defined + +// if (compiler->compInfoBlkSize > codeSize && compiler->compInfoBlkSize > 100) + { + printf("[%7u VM, %7u+%7u/%7u x86 %03u/%03u%%] %s.%s\n", + compiler->info.compILCodeSize, + compiler->compInfoBlkSize, + codeSize + dataSize, + codeSize + dataSize - prologSize - epilogSize, + 100 * (codeSize + dataSize) / compiler->info.compILCodeSize, + 100 * (codeSize + dataSize + compiler->compInfoBlkSize) / compiler->info.compILCodeSize, + compiler->info.compClassName, + compiler->info.compMethodName); +} + +#endif + + /* Fill in the info block and return it to the caller */ + + void* infoPtr = compiler->compInfoBlkAddr; + + /* Create the method info block: header followed by GC tracking tables */ + + compiler->compInfoBlkAddr += + gcInfo.gcInfoBlockHdrSave(compiler->compInfoBlkAddr, -1, codeSize, prologSize, epilogSize, &header, &s_cached); + + assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize); + compiler->compInfoBlkAddr = gcInfo.gcPtrTableSave(compiler->compInfoBlkAddr, header, codeSize, &argTabOffset); + assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize + ptrMapSize); + +#ifdef DEBUG + + if (0) + { + BYTE* temp = (BYTE*)infoPtr; + unsigned size = compiler->compInfoBlkAddr - temp; + BYTE* ptab = temp + headerSize; + + noway_assert(size == headerSize + ptrMapSize); + + 
printf("Method info block - header [%u bytes]:", headerSize); + + for (unsigned i = 0; i < size; i++) + { + if (temp == ptab) + { + printf("\nMethod info block - ptrtab [%u bytes]:", ptrMapSize); + printf("\n %04X: %*c", i & ~0xF, 3 * (i & 0xF), ' '); + } + else + { + if (!(i % 16)) + printf("\n %04X: ", i); + } + + printf("%02X ", *temp++); + } + + printf("\n"); + } + +#endif // DEBUG + +#if DUMP_GC_TABLES + + if (compiler->opts.dspGCtbls) + { + const BYTE* base = (BYTE*)infoPtr; + unsigned size; + unsigned methodSize; + InfoHdr dumpHeader; + + printf("GC Info for method %s\n", compiler->info.compFullName); + printf("GC info size = %3u\n", compiler->compInfoBlkSize); + + size = gcInfo.gcInfoBlockHdrDump(base, &dumpHeader, &methodSize); + // printf("size of header encoding is %3u\n", size); + printf("\n"); + + if (compiler->opts.dspGCtbls) + { + base += size; + size = gcInfo.gcDumpPtrTable(base, dumpHeader, methodSize); + // printf("size of pointer table is %3u\n", size); + printf("\n"); + noway_assert(compiler->compInfoBlkAddr == (base + size)); + } + } + +#ifdef DEBUG + if (jitOpts.testMask & 128) + { + for (unsigned offs = 0; offs < codeSize; offs++) + { + gcInfo.gcFindPtrsInFrame(infoPtr, codePtr, offs); + } + } +#endif // DEBUG +#endif // DUMP_GC_TABLES + + /* Make sure we ended up generating the expected number of bytes */ + + noway_assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + compiler->compInfoBlkSize); + + return infoPtr; +} + +#else // !JIT32_GCENCODER +void CodeGen::genCreateAndStoreGCInfoX64(unsigned codeSize, unsigned prologSize DEBUGARG(void* codePtr)) +{ + IAllocator* allowZeroAlloc = new (compiler, CMK_GC) AllowZeroAllocator(compiler->getAllocatorGC()); + GcInfoEncoder* gcInfoEncoder = new (compiler, CMK_GC) + GcInfoEncoder(compiler->info.compCompHnd, compiler->info.compMethodInfo, allowZeroAlloc, NOMEM); + assert(gcInfoEncoder); + + // Follow the code pattern of the x86 gc info encoder (genCreateAndStoreGCInfoJIT32). + gcInfo.gcInfoBlockHdrSave(gcInfoEncoder, codeSize, prologSize); + + // First we figure out the encoder ID's for the stack slots and registers. + gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_ASSIGN_SLOTS); + // Now we've requested all the slots we'll need; "finalize" these (make more compact data structures for them). + gcInfoEncoder->FinalizeSlotIds(); + // Now we can actually use those slot ID's to declare live ranges. 
+ gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_DO_WORK); + +#if defined(DEBUGGING_SUPPORT) + if (compiler->opts.compDbgEnC) + { + // what we have to preserve is called the "frame header" (see comments in VM\eetwain.cpp) + // which is: + // -return address + // -saved off RBP + // -saved 'this' pointer and bool for synchronized methods + + // 4 slots for RBP + return address + RSI + RDI + int preservedAreaSize = 4 * REGSIZE_BYTES; + + if (compiler->info.compFlags & CORINFO_FLG_SYNCH) + { + if (!(compiler->info.compFlags & CORINFO_FLG_STATIC)) + { + preservedAreaSize += REGSIZE_BYTES; + } + + // bool in synchronized methods that tracks whether the lock has been taken (takes 4 bytes on stack) + preservedAreaSize += 4; + } + + // Used to signal both that the method is compiled for EnC, and also the size of the block at the top of the + // frame + gcInfoEncoder->SetSizeOfEditAndContinuePreservedArea(preservedAreaSize); + } +#endif + + gcInfoEncoder->Build(); + + // GC Encoder automatically puts the GC info in the right spot using ICorJitInfo::allocGCInfo(size_t) + // let's save the values anyway for debugging purposes + compiler->compInfoBlkAddr = gcInfoEncoder->Emit(); + compiler->compInfoBlkSize = 0; // not exposed by the GCEncoder interface +} +#endif // !JIT32_GCENCODER + +/***************************************************************************** + * Emit a call to a helper function. + * + */ + +void CodeGen::genEmitHelperCall(unsigned helper, int argSize, emitAttr retSize, regNumber callTargetReg) +{ + void* addr = nullptr; + void* pAddr = nullptr; + + emitter::EmitCallType callType = emitter::EC_FUNC_TOKEN; + addr = compiler->compGetHelperFtn((CorInfoHelpFunc)helper, &pAddr); + regNumber callTarget = REG_NA; + regMaskTP killMask = compiler->compHelperCallKillSet((CorInfoHelpFunc)helper); + + if (!addr) + { + assert(pAddr != nullptr); + + // Absolute indirect call addr + // Note: Order of checks is important. First always check for pc-relative and next + // zero-relative. Because the former encoding is 1-byte smaller than the latter. + if (genCodeIndirAddrCanBeEncodedAsPCRelOffset((size_t)pAddr) || + genCodeIndirAddrCanBeEncodedAsZeroRelOffset((size_t)pAddr)) + { + // generate call whose target is specified by 32-bit offset relative to PC or zero. + callType = emitter::EC_FUNC_TOKEN_INDIR; + addr = pAddr; + } + else + { +#ifdef _TARGET_AMD64_ + // If this indirect address cannot be encoded as 32-bit offset relative to PC or Zero, + // load it into REG_HELPER_CALL_TARGET and use register indirect addressing mode to + // make the call. + // mov reg, addr + // call [reg] + + if (callTargetReg == REG_NA) + { + // If a callTargetReg has not been explicitly provided, we will use REG_DEFAULT_HELPER_CALL_TARGET, but + // this is only a valid assumption if the helper call is known to kill REG_DEFAULT_HELPER_CALL_TARGET. + callTargetReg = REG_DEFAULT_HELPER_CALL_TARGET; + regMaskTP callTargetMask = genRegMask(callTargetReg); + noway_assert((callTargetMask & killMask) == callTargetMask); + } + else + { + // The call target must not overwrite any live variable, though it may not be in the + // kill set for the call. 
+ regMaskTP callTargetMask = genRegMask(callTargetReg); + noway_assert((callTargetMask & regSet.rsMaskVars) == RBM_NONE); + } +#endif + + callTarget = callTargetReg; + CodeGen::genSetRegToIcon(callTarget, (ssize_t)pAddr, TYP_I_IMPL); + callType = emitter::EC_INDIR_ARD; + } + } + + getEmitter()->emitIns_Call(callType, compiler->eeFindHelper(helper), INDEBUG_LDISASM_COMMA(nullptr) addr, argSize, + retSize FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(EA_UNKNOWN), gcInfo.gcVarPtrSetCur, + gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur, + BAD_IL_OFFSET, // IL offset + callTarget, // ireg + REG_NA, 0, 0, // xreg, xmul, disp + false, // isJump + emitter::emitNoGChelper(helper)); + + regTracker.rsTrashRegSet(killMask); + regTracker.rsTrashRegsForGCInterruptability(); +} + +#if !defined(_TARGET_64BIT_) +//----------------------------------------------------------------------------- +// +// Code Generation for Long integers +// +//----------------------------------------------------------------------------- + +//------------------------------------------------------------------------ +// genStoreLongLclVar: Generate code to store a non-enregistered long lclVar +// +// Arguments: +// treeNode - A TYP_LONG lclVar node. +// +// Return Value: +// None. +// +// Assumptions: +// 'treeNode' must be a TYP_LONG lclVar node for a lclVar that has NOT been promoted. +// Its operand must be a GT_LONG node. +// +void CodeGen::genStoreLongLclVar(GenTree* treeNode) +{ + emitter* emit = getEmitter(); + + GenTreeLclVarCommon* lclNode = treeNode->AsLclVarCommon(); + unsigned lclNum = lclNode->gtLclNum; + LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]); + assert(varDsc->TypeGet() == TYP_LONG); + assert(!varDsc->lvPromoted); + GenTreePtr op1 = treeNode->gtOp.gtOp1; + noway_assert(op1->OperGet() == GT_LONG); + genConsumeRegs(op1); + + // Definitions of register candidates will have been lowered to 2 int lclVars. + assert(!treeNode->InReg()); + + GenTreePtr loVal = op1->gtGetOp1(); + GenTreePtr hiVal = op1->gtGetOp2(); + // NYI: Contained immediates. + NYI_IF((loVal->gtRegNum == REG_NA) || (hiVal->gtRegNum == REG_NA), "Store of long lclVar with contained immediate"); + emit->emitIns_R_S(ins_Store(TYP_INT), EA_4BYTE, loVal->gtRegNum, lclNum, 0); + emit->emitIns_R_S(ins_Store(TYP_INT), EA_4BYTE, hiVal->gtRegNum, lclNum, genTypeSize(TYP_INT)); +} +#endif // !defined(_TARGET_64BIT_) + +/***************************************************************************** +* Unit testing of the XArch emitter: generate a bunch of instructions into the prolog +* (it's as good a place as any), then use COMPlus_JitLateDisasm=* to see if the late +* disassembler thinks the instructions as the same as we do. +*/ + +// Uncomment "#define ALL_ARM64_EMITTER_UNIT_TESTS" to run all the unit tests here. +// After adding a unit test, and verifying it works, put it under this #ifdef, so we don't see it run every time. +//#define ALL_XARCH_EMITTER_UNIT_TESTS + +#if defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_AMD64_) +void CodeGen::genAmd64EmitterUnitTests() +{ + if (!verbose) + { + return; + } + + if (!compiler->opts.altJit) + { + // No point doing this in a "real" JIT. + return; + } + + // Mark the "fake" instructions in the output. + printf("*************** In genAmd64EmitterUnitTests()\n"); + + // We use this: + // genDefineTempLabel(genCreateTempLabel()); + // to create artificial labels to help separate groups of tests. 
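+    //
+    // A hypothetical new group of tests would follow the same pattern, e.g.:
+    //      genDefineTempLabel(genCreateTempLabel());
+    //      // vaddsd xmm0,xmm1,xmm2
+    //      getEmitter()->emitIns_R_R_R(INS_addsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);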
+
+ //
+ // AVX three-operand instructions
+ //
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#ifdef ALL_XARCH_EMITTER_UNIT_TESTS
+#ifdef FEATURE_AVX_SUPPORT
+ genDefineTempLabel(genCreateTempLabel());
+
+ // vhaddpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_haddpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_addss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddsd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_addsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddps xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_addps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddps ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_addps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddpd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_addpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_addpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_subss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubsd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_subsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubps xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_subps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubps ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_subps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubpd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_subpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_subpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_mulss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulsd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_mulsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulps xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_mulps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulpd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_mulpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulps ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_mulps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_mulpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vandps xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_andps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vandpd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_andpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vandps ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_andps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vandpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_andpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vorps xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_orps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vorpd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_orpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vorps ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_orps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vorpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_orpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vdivss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vdivsd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vdivss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vdivsd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+
+ // vcvtss2sd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_cvtss2sd, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vcvtsd2ss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_cvtsd2ss, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+#endif // FEATURE_AVX_SUPPORT
+#endif // ALL_XARCH_EMITTER_UNIT_TESTS
+ printf("*************** End of genAmd64EmitterUnitTests()\n");
+}
+
+#endif // defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_AMD64_)
+
+/*****************************************************************************/
+#ifdef DEBUGGING_SUPPORT
+/*****************************************************************************
+ * genSetScopeInfo
+ *
+ * Called by the main genSetScopeInfo() for every scope-info piece to be recorded.
+ */
+
+void CodeGen::genSetScopeInfo(unsigned which,
+ UNATIVE_OFFSET startOffs,
+ UNATIVE_OFFSET length,
+ unsigned varNum,
+ unsigned LVnum,
+ bool avail,
+ Compiler::siVarLoc& varLoc)
+{
+ /* We need to do some mapping while reporting back these variables */
+
+ unsigned ilVarNum = compiler->compMap2ILvarNum(varNum);
+ noway_assert((int)ilVarNum != ICorDebugInfo::UNKNOWN_ILNUM);
+
+ VarName name = nullptr;
+
+#ifdef DEBUG
+
+ for (unsigned scopeNum = 0; scopeNum < compiler->info.compVarScopesCount; scopeNum++)
+ {
+ if (LVnum == compiler->info.compVarScopes[scopeNum].vsdLVnum)
+ {
+ name = compiler->info.compVarScopes[scopeNum].vsdName;
+ }
+ }
+
+ // Hang on to this info.
+
+ TrnslLocalVarInfo& tlvi = genTrnslLocalVarInfo[which];
+
+ tlvi.tlviVarNum = ilVarNum;
+ tlvi.tlviLVnum = LVnum;
+ tlvi.tlviName = name;
+ tlvi.tlviStartPC = startOffs;
+ tlvi.tlviLength = length;
+ tlvi.tlviAvailable = avail;
+ tlvi.tlviVarLoc = varLoc;
+
+#endif // DEBUG
+
+ compiler->eeSetLVinfo(which, startOffs, length, ilVarNum, LVnum, name, avail, varLoc);
+}
+#endif // DEBUGGING_SUPPORT
+
+#endif // _TARGET_XARCH_
+
+#endif // !LEGACY_BACKEND
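Editor's note: as a usage illustration only (not part of this diff), here is a minimal sketch of how a caller elsewhere in the backend might invoke the genEmitHelperCall routine shown above. The choice of helper constant (CORINFO_HELP_STOP_FOR_GC) and the omission of callTargetReg (assumed to default to REG_NA) are assumptions made for the example, not taken from this change.

    // Hypothetical call site (sketch): emit a helper call that takes no stack
    // arguments and whose return value is unused. genEmitHelperCall resolves the
    // helper address, chooses a pc-relative, zero-relative, or register-indirect
    // call encoding, and trashes the helper's kill set via compHelperCallKillSet.
    genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, /* argSize */ 0, /* retSize */ EA_UNKNOWN);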