Diffstat (limited to 'src/jit/codegencommon.cpp')
-rwxr-xr-x | src/jit/codegencommon.cpp | 11779 |
1 files changed, 11779 insertions, 0 deletions
diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp new file mode 100755 index 0000000000..2710447ade --- /dev/null +++ b/src/jit/codegencommon.cpp @@ -0,0 +1,11779 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XX XX +XX Code Generator Common: XX +XX Methods common to all architectures and register allocation strategies XX +XX XX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +*/ + +// TODO-Cleanup: There are additional methods in CodeGen*.cpp that are almost +// identical, and which should probably be moved here. + +#include "jitpch.h" +#ifdef _MSC_VER +#pragma hdrstop +#endif +#include "codegen.h" + +#include "gcinfo.h" +#include "emit.h" + +#ifndef JIT32_GCENCODER +#include "gcinfoencoder.h" +#endif + +/*****************************************************************************/ + +const BYTE genTypeSizes[] = { +#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) sz, +#include "typelist.h" +#undef DEF_TP +}; + +const BYTE genTypeAlignments[] = { +#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) al, +#include "typelist.h" +#undef DEF_TP +}; + +const BYTE genTypeStSzs[] = { +#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) st, +#include "typelist.h" +#undef DEF_TP +}; + +const BYTE genActualTypes[] = { +#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) jitType, +#include "typelist.h" +#undef DEF_TP +}; + +void CodeGenInterface::setFramePointerRequiredEH(bool value) +{ + m_cgFramePointerRequired = value; + +#ifndef JIT32_GCENCODER + if (value) + { + // EnumGcRefs will only enumerate slots in aborted frames + // if they are fully-interruptible. So if we have a catch + // or finally that will keep frame-vars alive, we need to + // force fully-interruptible. 
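// Illustrative sketch (not part of this file): in a method shaped like
//
//     Object* p = AllocObj();                  // 'AllocObj' / 'Use' are hypothetical
//     try { MayThrow(); } finally { Use(p); }
//
// 'p' must stay reportable across the entire try body so the finally can use it,
// which is why the presence of EH forces the method to be fully interruptible.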
+ CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef DEBUG + if (verbose) + { + printf("Method has EH, marking method as fully interruptible\n"); + } +#endif + + m_cgInterruptible = true; + } +#endif // JIT32_GCENCODER +} + +/*****************************************************************************/ +CodeGenInterface* getCodeGenerator(Compiler* comp) +{ + return new (comp, CMK_Codegen) CodeGen(comp); +} + +// CodeGen constructor +CodeGenInterface::CodeGenInterface(Compiler* theCompiler) + : gcInfo(theCompiler), regSet(theCompiler, gcInfo), compiler(theCompiler) +{ +} + +/*****************************************************************************/ + +CodeGen::CodeGen(Compiler* theCompiler) : CodeGenInterface(theCompiler) +{ +#if defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87 + negBitmaskFlt = nullptr; + negBitmaskDbl = nullptr; + absBitmaskFlt = nullptr; + absBitmaskDbl = nullptr; + u8ToDblBitmask = nullptr; +#endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87 + + regTracker.rsTrackInit(compiler, ®Set); + gcInfo.regSet = ®Set; + m_cgEmitter = new (compiler->getAllocator()) emitter(); + m_cgEmitter->codeGen = this; + m_cgEmitter->gcInfo = &gcInfo; + +#ifdef DEBUG + setVerbose(compiler->verbose); +#endif // DEBUG + + compiler->tmpInit(); + +#ifdef DEBUG +#if defined(_TARGET_X86_) && defined(LEGACY_BACKEND) + // This appears to be x86-specific. It's attempting to make sure all offsets to temps + // are large. For ARM, this doesn't interact well with our decision about whether to use + // R10 or not as a reserved register. + if (regSet.rsStressRegs()) + compiler->tmpIntSpillMax = (SCHAR_MAX / sizeof(int)); +#endif // defined(_TARGET_X86_) && defined(LEGACY_BACKEND) +#endif // DEBUG + + instInit(); + +#ifdef LEGACY_BACKEND + // TODO-Cleanup: These used to be set in rsInit() - should they be moved to RegSet?? + // They are also accessed by the register allocators and fgMorphLclVar(). + intRegState.rsCurRegArgNum = 0; + floatRegState.rsCurRegArgNum = 0; +#endif // LEGACY_BACKEND + +#ifdef LATE_DISASM + getDisAssembler().disInit(compiler); +#endif + +#ifdef DEBUG + genTempLiveChg = true; + genTrnslLocalVarCount = 0; + + // Shouldn't be used before it is set in genFnProlog() + compiler->compCalleeRegsPushed = UninitializedWord<unsigned>(); + +#if defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87 + // Shouldn't be used before it is set in genFnProlog() + compiler->compCalleeFPRegsSavedMask = (regMaskTP)-1; +#endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87 +#endif // DEBUG + +#ifdef _TARGET_AMD64_ + // This will be set before final frame layout. + compiler->compVSQuirkStackPaddingNeeded = 0; + + // Set to true if we perform the Quirk that fixes the PPP issue + compiler->compQuirkForPPPflag = false; +#endif // _TARGET_AMD64_ + +#ifdef LEGACY_BACKEND + genFlagsEqualToNone(); +#endif // LEGACY_BACKEND + +#ifdef DEBUGGING_SUPPORT + // Initialize the IP-mapping logic. 
+ compiler->genIPmappingList = nullptr; + compiler->genIPmappingLast = nullptr; + compiler->genCallSite2ILOffsetMap = nullptr; +#endif + + /* Assume that we are not fully interruptible */ + + genInterruptible = false; +#ifdef DEBUG + genInterruptibleUsed = false; + genCurDispOffset = (unsigned)-1; +#endif +} + +void CodeGenInterface::genMarkTreeInReg(GenTreePtr tree, regNumber reg) +{ + tree->gtRegNum = reg; + tree->gtFlags |= GTF_REG_VAL; +} + +#if CPU_LONG_USES_REGPAIR +void CodeGenInterface::genMarkTreeInRegPair(GenTreePtr tree, regPairNo regPair) +{ + tree->gtRegPair = regPair; + tree->gtFlags |= GTF_REG_VAL; +} +#endif + +#if defined(_TARGET_X86_) || defined(_TARGET_ARM_) + +//--------------------------------------------------------------------- +// genTotalFrameSize - return the "total" size of the stack frame, including local size +// and callee-saved register size. There are a few things "missing" depending on the +// platform. The function genCallerSPtoInitialSPdelta() includes those things. +// +// For ARM, this doesn't include the prespilled registers. +// +// For x86, this doesn't include the frame pointer if codeGen->isFramePointerUsed() is true. +// It also doesn't include the pushed return address. +// +// Return value: +// Frame size + +int CodeGenInterface::genTotalFrameSize() +{ + assert(!IsUninitialized(compiler->compCalleeRegsPushed)); + + int totalFrameSize = compiler->compCalleeRegsPushed * REGSIZE_BYTES + compiler->compLclFrameSize; + + assert(totalFrameSize >= 0); + return totalFrameSize; +} + +//--------------------------------------------------------------------- +// genSPtoFPdelta - return the offset from SP to the frame pointer. +// This number is going to be positive, since SP must be at the lowest +// address. +// +// There must be a frame pointer to call this function! + +int CodeGenInterface::genSPtoFPdelta() +{ + assert(isFramePointerUsed()); + + int delta; + + delta = -genCallerSPtoInitialSPdelta() + genCallerSPtoFPdelta(); + + assert(delta >= 0); + return delta; +} + +//--------------------------------------------------------------------- +// genCallerSPtoFPdelta - return the offset from Caller-SP to the frame pointer. +// This number is going to be negative, since the Caller-SP is at a higher +// address than the frame pointer. +// +// There must be a frame pointer to call this function! + +int CodeGenInterface::genCallerSPtoFPdelta() +{ + assert(isFramePointerUsed()); + int callerSPtoFPdelta = 0; + +#if defined(_TARGET_ARM_) + // On ARM, we first push the prespill registers, then store LR, then R11 (FP), and point R11 at the saved R11. + callerSPtoFPdelta -= genCountBits(regSet.rsMaskPreSpillRegs(true)) * REGSIZE_BYTES; + callerSPtoFPdelta -= 2 * REGSIZE_BYTES; +#elif defined(_TARGET_X86_) + // Thanks to ebp chaining, the difference between ebp-based addresses + // and caller-SP-relative addresses is just the 2 pointers: + // return address + // pushed ebp + callerSPtoFPdelta -= 2 * REGSIZE_BYTES; +#else +#error "Unknown _TARGET_" +#endif // _TARGET_* + + assert(callerSPtoFPdelta <= 0); + return callerSPtoFPdelta; +} + +//--------------------------------------------------------------------- +// genCallerSPtoInitialSPdelta - return the offset from Caller-SP to Initial SP. +// +// This number will be negative.
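// Worked example (illustrative numbers, x86 with a frame pointer): with
// compCalleeRegsPushed = 3 and compLclFrameSize = 0x20,
//
//   genTotalFrameSize()           = 3 * 4 + 0x20                    = 0x2C
//   genCallerSPtoFPdelta()        = -2 * 4                          = -0x08
//   genCallerSPtoInitialSPdelta() = -0x2C - 4 (retaddr) - 4 (EBP)   = -0x34
//   genSPtoFPdelta()              = -(-0x34) + (-0x08)              = 0x2C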
+ +int CodeGenInterface::genCallerSPtoInitialSPdelta() +{ + int callerSPtoSPdelta = 0; + +#if defined(_TARGET_ARM_) + callerSPtoSPdelta -= genCountBits(regSet.rsMaskPreSpillRegs(true)) * REGSIZE_BYTES; + callerSPtoSPdelta -= genTotalFrameSize(); +#elif defined(_TARGET_X86_) + callerSPtoSPdelta -= genTotalFrameSize(); + callerSPtoSPdelta -= REGSIZE_BYTES; // caller-pushed return address + + // compCalleeRegsPushed does not account for the frame pointer + // TODO-Cleanup: shouldn't this be part of genTotalFrameSize? + if (isFramePointerUsed()) + { + callerSPtoSPdelta -= REGSIZE_BYTES; + } +#else +#error "Unknown _TARGET_" +#endif // _TARGET_* + + assert(callerSPtoSPdelta <= 0); + return callerSPtoSPdelta; +} + +#endif // defined(_TARGET_X86_) || defined(_TARGET_ARM_) + +/***************************************************************************** + * Should we round simple operations (assignments, arithmetic operations, etc.) + */ + +// inline +// static +bool CodeGen::genShouldRoundFP() +{ + RoundLevel roundLevel = getRoundFloatLevel(); + + switch (roundLevel) + { + case ROUND_NEVER: + case ROUND_CMP_CONST: + case ROUND_CMP: + return false; + + default: + assert(roundLevel == ROUND_ALWAYS); + return true; + } +} + +/***************************************************************************** + * + * Initialize some global variables. + */ + +void CodeGen::genPrepForCompiler() +{ + unsigned varNum; + LclVarDsc* varDsc; + + /* Figure out which non-register variables hold pointers */ + + VarSetOps::AssignNoCopy(compiler, gcInfo.gcTrkStkPtrLcls, VarSetOps::MakeEmpty(compiler)); + + // Figure out which variables live in registers. + // Also, initialize gcTrkStkPtrLcls to include all tracked variables that do not fully live + // in a register (i.e. they live on the stack for all or part of their lifetime). + // Note that lvRegister indicates that a lclVar is in a register for its entire lifetime. + + VarSetOps::AssignNoCopy(compiler, compiler->raRegVarsMask, VarSetOps::MakeEmpty(compiler)); + + for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->lvaCount; varNum++, varDsc++) + { + if (varDsc->lvTracked +#ifndef LEGACY_BACKEND + || varDsc->lvIsRegCandidate() +#endif // !LEGACY_BACKEND + ) + { + if (varDsc->lvRegister +#if FEATURE_STACK_FP_X87 + && !varDsc->IsFloatRegType() +#endif + ) + { + VarSetOps::AddElemD(compiler, compiler->raRegVarsMask, varDsc->lvVarIndex); + } + else if (compiler->lvaIsGCTracked(varDsc) && (!varDsc->lvIsParam || varDsc->lvIsRegArg)) + { + VarSetOps::AddElemD(compiler, gcInfo.gcTrkStkPtrLcls, varDsc->lvVarIndex); + } + } + } + VarSetOps::AssignNoCopy(compiler, genLastLiveSet, VarSetOps::MakeEmpty(compiler)); + genLastLiveMask = RBM_NONE; +#ifdef DEBUG + compiler->fgBBcountAtCodegen = compiler->fgBBcount; +#endif +} + +/***************************************************************************** + * To report exception handling information to the VM, we need the size of the exception + * handling regions. To compute that, we need to emit labels for the beginning block of + * an EH region, and the block that immediately follows a region. Go through the EH + * table and mark all these blocks with BBF_HAS_LABEL to make this happen. + * + * The beginning blocks of the EH regions already should have this flag set. + * + * No blocks should be added or removed after this. 
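 *
 * For example (illustrative block numbers): given a try/finally described by
 * { ebdTryBeg = BB03, ebdTryLast = BB05, ebdHndBeg = BB06, ebdHndLast = BB08 },
 * BB03 and BB06 already carry BBF_HAS_LABEL; this method additionally marks
 * BB06 (the block after the 'try') and BB09 (the block after the handler).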
+ * + * This code is closely coupled with genReportEH() in the sense that any block + * that this procedure has determined it needs to have a label has to be selected + * using the same logic both here and in genReportEH(), so basically any time there is + * a change in the way we handle EH reporting, we have to keep the logic of these two + * methods 'in sync'. + */ + +void CodeGen::genPrepForEHCodegen() +{ + assert(!compiler->fgSafeBasicBlockCreation); + + EHblkDsc* HBtab; + EHblkDsc* HBtabEnd; + + bool anyFinallys = false; + + for (HBtab = compiler->compHndBBtab, HBtabEnd = compiler->compHndBBtab + compiler->compHndBBtabCount; + HBtab < HBtabEnd; HBtab++) + { + assert(HBtab->ebdTryBeg->bbFlags & BBF_HAS_LABEL); + assert(HBtab->ebdHndBeg->bbFlags & BBF_HAS_LABEL); + + if (HBtab->ebdTryLast->bbNext != nullptr) + { + HBtab->ebdTryLast->bbNext->bbFlags |= BBF_HAS_LABEL; + } + + if (HBtab->ebdHndLast->bbNext != nullptr) + { + HBtab->ebdHndLast->bbNext->bbFlags |= BBF_HAS_LABEL; + } + + if (HBtab->HasFilter()) + { + assert(HBtab->ebdFilter->bbFlags & BBF_HAS_LABEL); + // The block after the last block of the filter is + // the handler begin block, which we already asserted + // has BBF_HAS_LABEL set. + } + +#ifdef _TARGET_AMD64_ + if (HBtab->HasFinallyHandler()) + { + anyFinallys = true; + } +#endif // _TARGET_AMD64_ + } + +#ifdef _TARGET_AMD64_ + if (anyFinallys) + { + for (BasicBlock* block = compiler->fgFirstBB; block != nullptr; block = block->bbNext) + { + if (block->bbJumpKind == BBJ_CALLFINALLY) + { + BasicBlock* bbToLabel = block->bbNext; + if (block->isBBCallAlwaysPair()) + { + bbToLabel = bbToLabel->bbNext; // skip the BBJ_ALWAYS + } + if (bbToLabel != nullptr) + { + bbToLabel->bbFlags |= BBF_HAS_LABEL; + } + } // block is BBJ_CALLFINALLY + } // for each block + } // if (anyFinallys) +#endif // _TARGET_AMD64_ +} + +void CodeGenInterface::genUpdateLife(GenTreePtr tree) +{ + compiler->compUpdateLife</*ForCodeGen*/ true>(tree); +} + +void CodeGenInterface::genUpdateLife(VARSET_VALARG_TP newLife) +{ + compiler->compUpdateLife</*ForCodeGen*/ true>(newLife); +} + +#ifdef LEGACY_BACKEND +// Returns the liveSet after tree has executed. +// "tree" MUST occur in the current statement, AFTER the most recent +// update of compiler->compCurLifeTree and compiler->compCurLife. +// +VARSET_VALRET_TP CodeGen::genUpdateLiveSetForward(GenTreePtr tree) +{ + VARSET_TP VARSET_INIT(compiler, startLiveSet, compiler->compCurLife); + GenTreePtr startNode; + assert(tree != compiler->compCurLifeTree); + if (compiler->compCurLifeTree == nullptr) + { + assert(compiler->compCurStmt != nullptr); + startNode = compiler->compCurStmt->gtStmt.gtStmtList; + } + else + { + startNode = compiler->compCurLifeTree->gtNext; + } + return compiler->fgUpdateLiveSet(startLiveSet, startNode, tree); +} + +// Determine the registers that are live after "second" has been evaluated, +// but which are not live after "first". +// PRECONDITIONS: +// 1. "first" must occur after compiler->compCurLifeTree in execution order for the current statement +// 2.
"second" must occur after "first" in the current statement +// +regMaskTP CodeGen::genNewLiveRegMask(GenTreePtr first, GenTreePtr second) +{ + // First, compute the liveset after "first" + VARSET_TP firstLiveSet = genUpdateLiveSetForward(first); + // Now, update the set forward from "first" to "second" + VARSET_TP secondLiveSet = compiler->fgUpdateLiveSet(firstLiveSet, first->gtNext, second); + regMaskTP newLiveMask = genLiveMask(VarSetOps::Diff(compiler, secondLiveSet, firstLiveSet)); + return newLiveMask; +} +#endif + +// Return the register mask for the given register variable +// inline +regMaskTP CodeGenInterface::genGetRegMask(const LclVarDsc* varDsc) +{ + regMaskTP regMask = RBM_NONE; + + assert(varDsc->lvIsInReg()); + + if (varTypeIsFloating(varDsc->TypeGet())) + { + regMask = genRegMaskFloat(varDsc->lvRegNum, varDsc->TypeGet()); + } + else + { + regMask = genRegMask(varDsc->lvRegNum); + if (isRegPairType(varDsc->lvType)) + { + regMask |= genRegMask(varDsc->lvOtherReg); + } + } + return regMask; +} + +// Return the register mask for the given lclVar or regVar tree node +// inline +regMaskTP CodeGenInterface::genGetRegMask(GenTreePtr tree) +{ + assert(tree->gtOper == GT_LCL_VAR || tree->gtOper == GT_REG_VAR); + + regMaskTP regMask = RBM_NONE; + const LclVarDsc* varDsc = compiler->lvaTable + tree->gtLclVarCommon.gtLclNum; + if (varDsc->lvPromoted) + { + for (unsigned i = varDsc->lvFieldLclStart; i < varDsc->lvFieldLclStart + varDsc->lvFieldCnt; ++i) + { + noway_assert(compiler->lvaTable[i].lvIsStructField); + if (compiler->lvaTable[i].lvIsInReg()) + { + regMask |= genGetRegMask(&compiler->lvaTable[i]); + } + } + } + else if (varDsc->lvIsInReg()) + { + regMask = genGetRegMask(varDsc); + } + return regMask; +} + +//------------------------------------------------------------------------ +// getRegistersFromMask: Given a register mask return the two registers +// specified by the mask. +// +// Arguments: +// regPairMask: a register mask that has exactly two bits set +// Return values: +// pLoReg: the address of where to write the first register +// pHiReg: the address of where to write the second register +// +void CodeGenInterface::genGetRegPairFromMask(regMaskTP regPairMask, regNumber* pLoReg, regNumber* pHiReg) +{ + assert(genCountBits(regPairMask) == 2); + + regMaskTP loMask = genFindLowestBit(regPairMask); // set loMask to a one-bit mask + regMaskTP hiMask = regPairMask - loMask; // set hiMask to the other bit that was in tmpRegMask + + regNumber loReg = genRegNumFromMask(loMask); // set loReg from loMask + regNumber hiReg = genRegNumFromMask(hiMask); // set hiReg from hiMask + + *pLoReg = loReg; + *pHiReg = hiReg; +} + +// The given lclVar is either going live (being born) or dying. +// It might be both going live and dying (that is, it is a dead store) under MinOpts. +// Update regSet.rsMaskVars accordingly. +// inline +void CodeGenInterface::genUpdateRegLife(const LclVarDsc* varDsc, bool isBorn, bool isDying DEBUGARG(GenTreePtr tree)) +{ +#if FEATURE_STACK_FP_X87 + // The stack fp reg vars are handled elsewhere + if (varTypeIsFloating(varDsc->TypeGet())) + return; +#endif + + regMaskTP regMask = genGetRegMask(varDsc); + +#ifdef DEBUG + if (compiler->verbose) + { + printf("\t\t\t\t\t\t\tV%02u in reg ", (varDsc - compiler->lvaTable)); + varDsc->PrintVarReg(); + printf(" is becoming %s ", (isDying) ? 
"dead" : "live"); + Compiler::printTreeID(tree); + printf("\n"); + } +#endif // DEBUG + + if (isDying) + { + // We'd like to be able to assert the following, however if we are walking + // through a qmark/colon tree, we may encounter multiple last-use nodes. + // assert((regSet.rsMaskVars & regMask) == regMask); + regSet.RemoveMaskVars(regMask); + } + else + { + assert((regSet.rsMaskVars & regMask) == 0); + regSet.AddMaskVars(regMask); + } +} + +// Gets a register mask that represent the kill set for a helper call since +// not all JIT Helper calls follow the standard ABI on the target architecture. +// +// TODO-CQ: Currently this list is incomplete (not all helpers calls are +// enumerated) and not 100% accurate (some killsets are bigger than +// what they really are). +// There's some work to be done in several places in the JIT to +// accurately track the registers that are getting killed by +// helper calls: +// a) LSRA needs several changes to accomodate more precise killsets +// for every helper call it sees (both explicitly [easy] and +// implicitly [hard]) +// b) Currently for AMD64, when we generate code for a helper call +// we're independently over-pessimizing the killsets of the call +// (independently from LSRA) and this needs changes +// both in CodeGenAmd64.cpp and emitx86.cpp. +// +// The best solution for this problem would be to try to centralize +// the killset information in a single place but then make the +// corresponding changes so every code generation phase is in sync +// about this. +// +// The interim solution is to only add known helper calls that don't +// follow the AMD64 ABI and actually trash registers that are supposed to be non-volatile. +regMaskTP Compiler::compHelperCallKillSet(CorInfoHelpFunc helper) +{ + switch (helper) + { + case CORINFO_HELP_ASSIGN_BYREF: +#if defined(_TARGET_AMD64_) + return RBM_RSI | RBM_RDI | RBM_CALLEE_TRASH; +#elif defined(_TARGET_ARM64_) + return RBM_CALLEE_TRASH_NOGC; +#else + NYI("Model kill set for CORINFO_HELP_ASSIGN_BYREF on target arch"); + return RBM_CALLEE_TRASH; +#endif + + case CORINFO_HELP_PROF_FCN_ENTER: +#ifdef _TARGET_AMD64_ + return RBM_PROFILER_ENTER_TRASH; +#else + unreached(); +#endif + case CORINFO_HELP_PROF_FCN_LEAVE: + case CORINFO_HELP_PROF_FCN_TAILCALL: +#ifdef _TARGET_AMD64_ + return RBM_PROFILER_LEAVE_TRASH; +#else + unreached(); +#endif + + case CORINFO_HELP_STOP_FOR_GC: + return RBM_STOP_FOR_GC_TRASH; + + case CORINFO_HELP_INIT_PINVOKE_FRAME: + return RBM_INIT_PINVOKE_FRAME_TRASH; + + default: + return RBM_CALLEE_TRASH; + } +} + +// +// Gets a register mask that represents the kill set for "NO GC" helper calls since +// not all JIT Helper calls follow the standard ABI on the target architecture. +// +// Note: This list may not be complete and defaults to the default NOGC registers. 
+// +regMaskTP Compiler::compNoGCHelperCallKillSet(CorInfoHelpFunc helper) +{ + assert(emitter::emitNoGChelper(helper)); +#ifdef _TARGET_AMD64_ + switch (helper) + { + case CORINFO_HELP_PROF_FCN_ENTER: + return RBM_PROFILER_ENTER_TRASH; + + case CORINFO_HELP_PROF_FCN_LEAVE: + case CORINFO_HELP_PROF_FCN_TAILCALL: + return RBM_PROFILER_LEAVE_TRASH; + + case CORINFO_HELP_ASSIGN_BYREF: + // this helper doesn't trash RSI and RDI + return RBM_CALLEE_TRASH_NOGC & ~(RBM_RSI | RBM_RDI); + + default: + return RBM_CALLEE_TRASH_NOGC; + } +#else + return RBM_CALLEE_TRASH_NOGC; +#endif +} + +// Update liveness (always var liveness, i.e., compCurLife, and also, if "ForCodeGen" is true, reg liveness, i.e., +// regSet.rsMaskVars as well) +// if the given lclVar (or indir(addr(local)))/regVar node is going live (being born) or dying. +template <bool ForCodeGen> +void Compiler::compUpdateLifeVar(GenTreePtr tree, VARSET_TP* pLastUseVars) +{ + GenTreePtr indirAddrLocal = fgIsIndirOfAddrOfLocal(tree); + assert(tree->OperIsNonPhiLocal() || indirAddrLocal != nullptr); + + // Get the local var tree -- if "tree" is "Ldobj(addr(x))", or "ind(addr(x))" this is "x", else it's "tree". + GenTreePtr lclVarTree = indirAddrLocal; + if (lclVarTree == nullptr) + { + lclVarTree = tree; + } + unsigned int lclNum = lclVarTree->gtLclVarCommon.gtLclNum; + LclVarDsc* varDsc = lvaTable + lclNum; + +#ifdef DEBUG +#if !defined(_TARGET_AMD64_) + // There are no addr nodes on ARM and we are experimenting with encountering vars in 'random' order. + // Struct fields are not traversed in a consistent order, so ignore them when + // verifying that we see the var nodes in execution order + if (ForCodeGen) + { + if (tree->OperIsIndir()) + { + assert(indirAddrLocal != NULL); + } + else if (tree->gtNext != NULL && tree->gtNext->gtOper == GT_ADDR && + ((tree->gtNext->gtNext == NULL || !tree->gtNext->gtNext->OperIsIndir()))) + { + assert(tree->IsLocal()); // Can only take the address of a local. + // The ADDR might occur in a context where the address it contributes is eventually + // dereferenced, so we can't say that this is not a use or def. + } +#if 0 + // TODO-ARM64-Bug?: These asserts don't seem right for ARM64: I don't understand why we have to assert + // two consecutive lclvars (in execution order) can only be observed if the first one is a struct field. + // It seems to me this is code only applicable to the legacy JIT and not RyuJIT (and therefore why it was + // ifdef'ed out for AMD64). + else if (!varDsc->lvIsStructField) + { + GenTreePtr prevTree; + for (prevTree = tree->gtPrev; + prevTree != NULL && prevTree != compCurLifeTree; + prevTree = prevTree->gtPrev) + { + if ((prevTree->gtOper == GT_LCL_VAR) || (prevTree->gtOper == GT_REG_VAR)) + { + LclVarDsc * prevVarDsc = lvaTable + prevTree->gtLclVarCommon.gtLclNum; + + // These are the only things for which this method MUST be called + assert(prevVarDsc->lvIsStructField); + } + } + assert(prevTree == compCurLifeTree); + } +#endif // 0 + } +#endif // !_TARGET_AMD64_ +#endif // DEBUG + + compCurLifeTree = tree; + VARSET_TP VARSET_INIT(this, newLife, compCurLife); + + // By codegen, a struct may not be TYP_STRUCT, so we have to + // check lvPromoted, for the case where the fields are being + // tracked. + if (!varDsc->lvTracked && !varDsc->lvPromoted) + { + return; + } + + bool isBorn = ((tree->gtFlags & GTF_VAR_DEF) != 0 && (tree->gtFlags & GTF_VAR_USEASG) == 0); // if it's "x <op>= + // ..." then variable + // "x" must have had a + // previous, original, + // site to be born. 
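    // For example: a pure definition "x = y" has GTF_VAR_DEF set and
    // GTF_VAR_USEASG clear, so it is a birth of "x"; an update "x += y" sets
    // both flags, so "x" was already live and isBorn stays false.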
+ bool isDying = ((tree->gtFlags & GTF_VAR_DEATH) != 0); +#ifndef LEGACY_BACKEND + bool spill = ((tree->gtFlags & GTF_SPILL) != 0); +#endif // !LEGACY_BACKEND + +#ifndef LEGACY_BACKEND + // For RyuJIT backend, since all tracked vars are register candidates, but not all are in registers at all times, + // we maintain two separate sets of variables - the total set of variables that are either + // born or dying here, and the subset of those that are on the stack + VARSET_TP VARSET_INIT_NOCOPY(stackVarDeltaSet, VarSetOps::MakeEmpty(this)); +#endif // !LEGACY_BACKEND + + if (isBorn || isDying) + { + bool hasDeadTrackedFieldVars = false; // If this is true, then, for a LDOBJ(ADDR(<promoted struct local>)), + VARSET_TP* deadTrackedFieldVars = + nullptr; // *deadTrackedFieldVars indicates which tracked field vars are dying. + VARSET_TP VARSET_INIT_NOCOPY(varDeltaSet, VarSetOps::MakeEmpty(this)); + + if (varDsc->lvTracked) + { + VarSetOps::AddElemD(this, varDeltaSet, varDsc->lvVarIndex); + if (ForCodeGen) + { +#ifndef LEGACY_BACKEND + if (isBorn && varDsc->lvIsRegCandidate() && tree->gtHasReg()) + { + codeGen->genUpdateVarReg(varDsc, tree); + } +#endif // !LEGACY_BACKEND + if (varDsc->lvIsInReg() +#ifndef LEGACY_BACKEND + && tree->gtRegNum != REG_NA +#endif // !LEGACY_BACKEND + ) + { + codeGen->genUpdateRegLife(varDsc, isBorn, isDying DEBUGARG(tree)); + } +#ifndef LEGACY_BACKEND + else + { + VarSetOps::AddElemD(this, stackVarDeltaSet, varDsc->lvVarIndex); + } +#endif // !LEGACY_BACKEND + } + } + else if (varDsc->lvPromoted) + { + if (indirAddrLocal != nullptr && isDying) + { + assert(!isBorn); // GTF_VAR_DEATH only set for LDOBJ last use. + hasDeadTrackedFieldVars = GetPromotedStructDeathVars()->Lookup(indirAddrLocal, &deadTrackedFieldVars); + if (hasDeadTrackedFieldVars) + { + VarSetOps::Assign(this, varDeltaSet, *deadTrackedFieldVars); + } + } + + for (unsigned i = varDsc->lvFieldLclStart; i < varDsc->lvFieldLclStart + varDsc->lvFieldCnt; ++i) + { + LclVarDsc* fldVarDsc = &(lvaTable[i]); + noway_assert(fldVarDsc->lvIsStructField); + if (fldVarDsc->lvTracked) + { + unsigned fldVarIndex = fldVarDsc->lvVarIndex; + noway_assert(fldVarIndex < lvaTrackedCount); + if (!hasDeadTrackedFieldVars) + { + VarSetOps::AddElemD(this, varDeltaSet, fldVarIndex); + if (ForCodeGen) + { + // We repeat this call here and below to avoid the VarSetOps::IsMember + // test in this, the common case, where we have no deadTrackedFieldVars. + if (fldVarDsc->lvIsInReg()) + { +#ifndef LEGACY_BACKEND + if (isBorn) + { + codeGen->genUpdateVarReg(fldVarDsc, tree); + } +#endif // !LEGACY_BACKEND + codeGen->genUpdateRegLife(fldVarDsc, isBorn, isDying DEBUGARG(tree)); + } +#ifndef LEGACY_BACKEND + else + { + VarSetOps::AddElemD(this, stackVarDeltaSet, fldVarIndex); + } +#endif // !LEGACY_BACKEND + } + } + else if (ForCodeGen && VarSetOps::IsMember(this, varDeltaSet, fldVarIndex)) + { + if (lvaTable[i].lvIsInReg()) + { +#ifndef LEGACY_BACKEND + if (isBorn) + { + codeGen->genUpdateVarReg(fldVarDsc, tree); + } +#endif // !LEGACY_BACKEND + codeGen->genUpdateRegLife(fldVarDsc, isBorn, isDying DEBUGARG(tree)); + } +#ifndef LEGACY_BACKEND + else + { + VarSetOps::AddElemD(this, stackVarDeltaSet, fldVarIndex); + } +#endif // !LEGACY_BACKEND + } + } + } + } + + // First, update the live set + if (isDying) + { + // We'd like to be able to assert the following, however if we are walking + // through a qmark/colon tree, we may encounter multiple last-use nodes. 
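            // (For example, in a qmark/colon tree "cond ? use(x) : use(x)", liveness
            // can mark the occurrence of "x" in each arm as a last use.)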
+ // assert (VarSetOps::IsSubset(compiler, regVarDeltaSet, newLife)); + VarSetOps::DiffD(this, newLife, varDeltaSet); + if (pLastUseVars != nullptr) + { + VarSetOps::Assign(this, *pLastUseVars, varDeltaSet); + } + } + else + { + // This shouldn't be in newLife, unless this is debug code, in which + // case we keep vars live everywhere, OR the variable is address-exposed, + // OR this block is part of a try block, in which case it may be live at the handler + // Could add a check that, if it's in newLife, that it's also in + // fgGetHandlerLiveVars(compCurBB), but seems excessive + // + // For a dead store, it can be the case that we set both isBorn and isDying to true. + // (We don't eliminate dead stores under MinOpts, so we can't assume they're always + // eliminated.) If it's both, we handled it above. + VarSetOps::UnionD(this, newLife, varDeltaSet); + } + } + + if (!VarSetOps::Equal(this, compCurLife, newLife)) + { +#ifdef DEBUG + if (verbose) + { + printf("\t\t\t\t\t\t\tLive vars: "); + dumpConvertedVarSet(this, compCurLife); + printf(" => "); + dumpConvertedVarSet(this, newLife); + printf("\n"); + } +#endif // DEBUG + + VarSetOps::Assign(this, compCurLife, newLife); + + if (ForCodeGen) + { +#ifndef LEGACY_BACKEND + + // Only add vars to the gcInfo.gcVarPtrSetCur if they are currently on stack, since the + // gcInfo.gcTrkStkPtrLcls + // includes all TRACKED vars that EVER live on the stack (i.e. are not always in a register). + VARSET_TP VARSET_INIT_NOCOPY(gcTrkStkDeltaSet, + VarSetOps::Intersection(this, codeGen->gcInfo.gcTrkStkPtrLcls, + stackVarDeltaSet)); + if (!VarSetOps::IsEmpty(this, gcTrkStkDeltaSet)) + { +#ifdef DEBUG + if (verbose) + { + printf("\t\t\t\t\t\t\tGCvars: "); + dumpConvertedVarSet(this, codeGen->gcInfo.gcVarPtrSetCur); + printf(" => "); + } +#endif // DEBUG + + if (isBorn) + { + VarSetOps::UnionD(this, codeGen->gcInfo.gcVarPtrSetCur, gcTrkStkDeltaSet); + } + else + { + VarSetOps::DiffD(this, codeGen->gcInfo.gcVarPtrSetCur, gcTrkStkDeltaSet); + } + +#ifdef DEBUG + if (verbose) + { + dumpConvertedVarSet(this, codeGen->gcInfo.gcVarPtrSetCur); + printf("\n"); + } +#endif // DEBUG + } + +#else // LEGACY_BACKEND + +#ifdef DEBUG + if (verbose) + { + VARSET_TP VARSET_INIT_NOCOPY(gcVarPtrSetNew, + VarSetOps::Intersection(this, newLife, codeGen->gcInfo.gcTrkStkPtrLcls)); + if (!VarSetOps::Equal(this, codeGen->gcInfo.gcVarPtrSetCur, gcVarPtrSetNew)) + { + printf("\t\t\t\t\t\t\tGCvars: "); + dumpConvertedVarSet(this, codeGen->gcInfo.gcVarPtrSetCur); + printf(" => "); + dumpConvertedVarSet(this, gcVarPtrSetNew); + printf("\n"); + } + } +#endif // DEBUG + + VarSetOps::AssignNoCopy(this, codeGen->gcInfo.gcVarPtrSetCur, + VarSetOps::Intersection(this, newLife, codeGen->gcInfo.gcTrkStkPtrLcls)); + +#endif // LEGACY_BACKEND + +#ifdef DEBUGGING_SUPPORT + codeGen->siUpdate(); +#endif + } + } + +#ifndef LEGACY_BACKEND + if (ForCodeGen && spill) + { + assert(!varDsc->lvPromoted); + codeGen->genSpillVar(tree); + if (VarSetOps::IsMember(this, codeGen->gcInfo.gcTrkStkPtrLcls, varDsc->lvVarIndex)) + { + if (!VarSetOps::IsMember(this, codeGen->gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex)) + { + VarSetOps::AddElemD(this, codeGen->gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); +#ifdef DEBUG + if (verbose) + { + printf("\t\t\t\t\t\t\tVar V%02u becoming live\n", varDsc - lvaTable); + } +#endif // DEBUG + } + } + } +#endif // !LEGACY_BACKEND +} + +// Need an explicit instantiation. 
+template void Compiler::compUpdateLifeVar<false>(GenTreePtr tree, VARSET_TP* pLastUseVars); + +template <bool ForCodeGen> +void Compiler::compChangeLife(VARSET_VALARG_TP newLife DEBUGARG(GenTreePtr tree)) +{ + LclVarDsc* varDsc; + +#ifdef DEBUG + if (verbose) + { + if (tree != nullptr) + { + Compiler::printTreeID(tree); + } + printf("Change life %s ", VarSetOps::ToString(this, compCurLife)); + dumpConvertedVarSet(this, compCurLife); + printf(" -> %s ", VarSetOps::ToString(this, newLife)); + dumpConvertedVarSet(this, newLife); + printf("\n"); + } +#endif // DEBUG + + /* We should only be called when the live set has actually changed */ + + noway_assert(!VarSetOps::Equal(this, compCurLife, newLife)); + + if (!ForCodeGen) + { + VarSetOps::Assign(this, compCurLife, newLife); + return; + } + + /* Figure out which variables are becoming live/dead at this point */ + + // deadSet = compCurLife - newLife + VARSET_TP VARSET_INIT(this, deadSet, compCurLife); + VarSetOps::DiffD(this, deadSet, newLife); + + // bornSet = newLife - compCurLife + VARSET_TP VARSET_INIT(this, bornSet, newLife); + VarSetOps::DiffD(this, bornSet, compCurLife); + + /* Can't simultaneously become live and dead at the same time */ + + // (deadSet UNION bornSet) != EMPTY + noway_assert(!VarSetOps::IsEmpty(this, VarSetOps::Union(this, deadSet, bornSet))); + // (deadSet INTERSECTION bornSet) == EMPTY + noway_assert(VarSetOps::IsEmpty(this, VarSetOps::Intersection(this, deadSet, bornSet))); + +#ifdef LEGACY_BACKEND + // In the LEGACY_BACKEND case, we only consider variables that are fully enregisterd + // and there may be none. + VarSetOps::IntersectionD(this, deadSet, raRegVarsMask); + VarSetOps::IntersectionD(this, bornSet, raRegVarsMask); + // And all gcTrkStkPtrLcls that are now live will be on the stack + VarSetOps::AssignNoCopy(this, codeGen->gcInfo.gcVarPtrSetCur, + VarSetOps::Intersection(this, newLife, codeGen->gcInfo.gcTrkStkPtrLcls)); +#endif // LEGACY_BACKEND + + VarSetOps::Assign(this, compCurLife, newLife); + + // Handle the dying vars first, then the newly live vars. + // This is because, in the RyuJIT backend case, they may occupy registers that + // will be occupied by another var that is newly live. + VARSET_ITER_INIT(this, deadIter, deadSet, deadVarIndex); + while (deadIter.NextElem(this, &deadVarIndex)) + { + unsigned varNum = lvaTrackedToVarNum[deadVarIndex]; + varDsc = lvaTable + varNum; + bool isGCRef = (varDsc->TypeGet() == TYP_REF); + bool isByRef = (varDsc->TypeGet() == TYP_BYREF); + + if (varDsc->lvIsInReg()) + { + // TODO-Cleanup: Move the code from compUpdateLifeVar to genUpdateRegLife that updates the + // gc sets + regMaskTP regMask = varDsc->lvRegMask(); + if (isGCRef) + { + codeGen->gcInfo.gcRegGCrefSetCur &= ~regMask; + } + else if (isByRef) + { + codeGen->gcInfo.gcRegByrefSetCur &= ~regMask; + } + codeGen->genUpdateRegLife(varDsc, false /*isBorn*/, true /*isDying*/ DEBUGARG(tree)); + } +#ifndef LEGACY_BACKEND + // This isn't in a register, so update the gcVarPtrSetCur. + // (Note that in the LEGACY_BACKEND case gcVarPtrSetCur is updated above unconditionally + // for all gcTrkStkPtrLcls in newLife, because none of them ever live in a register.) 
+ else if (isGCRef || isByRef) + { + VarSetOps::RemoveElemD(this, codeGen->gcInfo.gcVarPtrSetCur, deadVarIndex); + JITDUMP("\t\t\t\t\t\t\tV%02u becoming dead\n", varNum); + } +#endif // !LEGACY_BACKEND + } + + VARSET_ITER_INIT(this, bornIter, bornSet, bornVarIndex); + while (bornIter.NextElem(this, &bornVarIndex)) + { + unsigned varNum = lvaTrackedToVarNum[bornVarIndex]; + varDsc = lvaTable + varNum; + bool isGCRef = (varDsc->TypeGet() == TYP_REF); + bool isByRef = (varDsc->TypeGet() == TYP_BYREF); + + if (varDsc->lvIsInReg()) + { +#ifndef LEGACY_BACKEND +#ifdef DEBUG + if (VarSetOps::IsMember(this, codeGen->gcInfo.gcVarPtrSetCur, bornVarIndex)) + { + JITDUMP("\t\t\t\t\t\t\tRemoving V%02u from gcVarPtrSetCur\n", varNum); + } +#endif // DEBUG + VarSetOps::RemoveElemD(this, codeGen->gcInfo.gcVarPtrSetCur, bornVarIndex); +#endif // !LEGACY_BACKEND + codeGen->genUpdateRegLife(varDsc, true /*isBorn*/, false /*isDying*/ DEBUGARG(tree)); + regMaskTP regMask = varDsc->lvRegMask(); + if (isGCRef) + { + codeGen->gcInfo.gcRegGCrefSetCur |= regMask; + } + else if (isByRef) + { + codeGen->gcInfo.gcRegByrefSetCur |= regMask; + } + } +#ifndef LEGACY_BACKEND + // This isn't in a register, so update the gcVarPtrSetCur + else if (lvaIsGCTracked(varDsc)) + { + VarSetOps::AddElemD(this, codeGen->gcInfo.gcVarPtrSetCur, bornVarIndex); + JITDUMP("\t\t\t\t\t\t\tV%02u becoming live\n", varNum); + } +#endif // !LEGACY_BACKEND + } + +#ifdef DEBUGGING_SUPPORT + codeGen->siUpdate(); +#endif +} + +// Need an explicit instantiation. +template void Compiler::compChangeLife<true>(VARSET_VALARG_TP newLife DEBUGARG(GenTreePtr tree)); + +#ifdef LEGACY_BACKEND + +/***************************************************************************** + * + * Get the mask of integer registers that contain 'live' enregistered + * local variables after "tree". + * + * The output is the mask of integer registers that are currently + * alive and holding the enregistered local variables. + */ +regMaskTP CodeGenInterface::genLiveMask(GenTreePtr tree) +{ + regMaskTP liveMask = regSet.rsMaskVars; + + GenTreePtr nextNode; + if (compiler->compCurLifeTree == nullptr) + { + assert(compiler->compCurStmt != nullptr); + nextNode = compiler->compCurStmt->gtStmt.gtStmtList; + } + else + { + nextNode = compiler->compCurLifeTree->gtNext; + } + + // Theoretically, we should always be able to find "tree" by walking + // forward in execution order. But unfortunately, there is at least + // one case (addressing) where a node may be evaluated out of order + // So, we have to handle that case + bool outOfOrder = false; + for (; nextNode != tree->gtNext; nextNode = nextNode->gtNext) + { + if (nextNode == nullptr) + { + outOfOrder = true; + break; + } + if (nextNode->gtOper == GT_LCL_VAR || nextNode->gtOper == GT_REG_VAR) + { + bool isBorn = ((tree->gtFlags & GTF_VAR_DEF) != 0 && (tree->gtFlags & GTF_VAR_USEASG) == 0); + bool isDying = ((nextNode->gtFlags & GTF_VAR_DEATH) != 0); + if (isBorn || isDying) + { + regMaskTP regMask = genGetRegMask(nextNode); + if (regMask != RBM_NONE) + { + if (isBorn) + { + liveMask |= regMask; + } + else + { + liveMask &= ~(regMask); + } + } + } + } + } + if (outOfOrder) + { + assert(compiler->compCurLifeTree != nullptr); + liveMask = regSet.rsMaskVars; + // We were unable to find "tree" by traversing forward. We must now go + // backward from compiler->compCurLifeTree instead. 
We have to start with compiler->compCurLifeTree, + // since regSet.rsMaskVars reflects its completed execution + for (nextNode = compiler->compCurLifeTree; nextNode != tree; nextNode = nextNode->gtPrev) + { + assert(nextNode != nullptr); + + if (nextNode->gtOper == GT_LCL_VAR || nextNode->gtOper == GT_REG_VAR) + { + bool isBorn = ((tree->gtFlags & GTF_VAR_DEF) != 0 && (tree->gtFlags & GTF_VAR_USEASG) == 0); + bool isDying = ((nextNode->gtFlags & GTF_VAR_DEATH) != 0); + if (isBorn || isDying) + { + regMaskTP regMask = genGetRegMask(nextNode); + if (regMask != RBM_NONE) + { + // We're going backward - so things born are removed + // and vice versa + if (isBorn) + { + liveMask &= ~(regMask); + } + else + { + liveMask |= regMask; + } + } + } + } + } + } + return liveMask; +} + +/***************************************************************************** + * + * Get the mask of integer registers that contain 'live' enregistered + * local variables. + + * The input is a liveSet which contains a set of local + * variables that are currently alive + * + * The output is the mask of x86 integer registers that are currently + * alive and holding the enregistered local variables + */ + +regMaskTP CodeGenInterface::genLiveMask(VARSET_VALARG_TP liveSet) +{ + // Check for the zero LiveSet mask + if (VarSetOps::IsEmpty(compiler, liveSet)) + { + return RBM_NONE; + } + + // set if our liveSet matches the one we have cached: genLastLiveSet -> genLastLiveMask + if (VarSetOps::Equal(compiler, liveSet, genLastLiveSet)) + { + return genLastLiveMask; + } + + regMaskTP liveMask = 0; + + VARSET_ITER_INIT(compiler, iter, liveSet, varIndex); + while (iter.NextElem(compiler, &varIndex)) + { + + // If the variable is not enregistered, then it can't contribute to the liveMask + if (!VarSetOps::IsMember(compiler, compiler->raRegVarsMask, varIndex)) + { + continue; + } + + // Find the variable in compiler->lvaTable + unsigned varNum = compiler->lvaTrackedToVarNum[varIndex]; + LclVarDsc* varDsc = compiler->lvaTable + varNum; + +#if !FEATURE_FP_REGALLOC + // If the variable is a floating point type, then it can't contribute to the liveMask + if (varDsc->IsFloatRegType()) + { + continue; + } +#endif + + noway_assert(compiler->lvaTable[varNum].lvRegister); + regMaskTP regBit; + + if (varTypeIsFloating(varDsc->TypeGet())) + { + regBit = genRegMaskFloat(varDsc->lvRegNum, varDsc->TypeGet()); + } + else + { + regBit = genRegMask(varDsc->lvRegNum); + + // For longs we may have two regs + if (isRegPairType(varDsc->lvType) && varDsc->lvOtherReg != REG_STK) + { + regBit |= genRegMask(varDsc->lvOtherReg); + } + } + + noway_assert(regBit != 0); + + // We should not already have any of these bits set + noway_assert((liveMask & regBit) == 0); + + // Update the liveMask with the register bits that are live + liveMask |= regBit; + } + + // cache the last mapping between gtLiveSet -> liveMask + VarSetOps::Assign(compiler, genLastLiveSet, liveSet); + genLastLiveMask = liveMask; + + return liveMask; +} + +#endif + +/***************************************************************************** + * + * Generate a spill. + */ +void CodeGenInterface::spillReg(var_types type, TempDsc* tmp, regNumber reg) +{ + getEmitter()->emitIns_S_R(ins_Store(type), emitActualTypeSize(type), reg, tmp->tdTempNum(), 0); +} + +/***************************************************************************** + * + * Generate a reload. 
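 *
 * For example (illustrative): a value saved earlier with
 *     spillReg(TYP_INT, tmp, REG_EAX);   // store EAX into the temp's slot
 * is brought back into a register with
 *     reloadReg(TYP_INT, tmp, REG_EAX);  // load the temp's slot back into EAX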
 */ +void CodeGenInterface::reloadReg(var_types type, TempDsc* tmp, regNumber reg) +{ + getEmitter()->emitIns_R_S(ins_Load(type), emitActualTypeSize(type), reg, tmp->tdTempNum(), 0); +} + +#ifdef LEGACY_BACKEND +#if defined(_TARGET_ARM_) || defined(_TARGET_AMD64_) +void CodeGenInterface::reloadFloatReg(var_types type, TempDsc* tmp, regNumber reg) +{ + var_types tmpType = tmp->tdTempType(); + getEmitter()->emitIns_R_S(ins_FloatLoad(type), emitActualTypeSize(tmpType), reg, tmp->tdTempNum(), 0); +} +#endif +#endif // LEGACY_BACKEND + +// inline +regNumber CodeGenInterface::genGetThisArgReg(GenTreePtr call) +{ + noway_assert(call->IsCall()); + return REG_ARG_0; +} + +//---------------------------------------------------------------------- +// getSpillTempDsc: get the TempDsc corresponding to a spilled tree. +// +// Arguments: +// tree - spilled GenTree node +// +// Return Value: +// TempDsc corresponding to tree +TempDsc* CodeGenInterface::getSpillTempDsc(GenTree* tree) +{ + // tree must be in spilled state. + assert((tree->gtFlags & GTF_SPILLED) != 0); + + // Get the tree's SpillDsc. + RegSet::SpillDsc* prevDsc; + RegSet::SpillDsc* spillDsc = regSet.rsGetSpillInfo(tree, tree->gtRegNum, &prevDsc); + assert(spillDsc != nullptr); + + // Get the temp desc. + TempDsc* temp = regSet.rsGetSpillTempWord(tree->gtRegNum, spillDsc, prevDsc); + return temp; +} + +#ifdef _TARGET_XARCH_ + +#ifdef _TARGET_AMD64_ +// Returns relocation type hint for an addr. +// Note that there are no reloc hints on x86. +// +// Arguments +// addr - data address +// +// Returns +// relocation type hint +// +unsigned short CodeGenInterface::genAddrRelocTypeHint(size_t addr) +{ + return compiler->eeGetRelocTypeHint((void*)addr); +} +#endif //_TARGET_AMD64_ + +// Return true if an absolute indirect data address can be encoded as an IP-relative +// offset. Note that this method should be used only when the caller knows that +// the address is an icon value that VM has given and there is no GenTree node +// representing it. Otherwise, one should always use FitsInAddrBase(). +// +// Arguments +// addr - an absolute indirect data address +// +// Returns +// true if indir data addr could be encoded as IP-relative offset. +// +bool CodeGenInterface::genDataIndirAddrCanBeEncodedAsPCRelOffset(size_t addr) +{ +#ifdef _TARGET_AMD64_ + return genAddrRelocTypeHint(addr) == IMAGE_REL_BASED_REL32; +#else + // x86: PC-relative addressing is available only for control flow instructions (jmp and call) + return false; +#endif +} + +// Return true if an indirect code address can be encoded as IP-relative offset. +// Note that this method should be used only when the caller knows that the +// address is an icon value that VM has given and there is no GenTree node +// representing it. Otherwise, one should always use FitsInAddrBase(). +// +// Arguments +// addr - an absolute indirect code address +// +// Returns +// true if indir code addr could be encoded as IP-relative offset. +// +bool CodeGenInterface::genCodeIndirAddrCanBeEncodedAsPCRelOffset(size_t addr) +{ +#ifdef _TARGET_AMD64_ + return genAddrRelocTypeHint(addr) == IMAGE_REL_BASED_REL32; +#else + // x86: PC-relative addressing is available only for control flow instructions (jmp and call) + return true; +#endif +} + +// Return true if an indirect code address can be encoded as 32-bit displacement +// relative to zero. Note that this method should be used only when the caller +// knows that the address is an icon value that VM has given and there is no +// GenTree node representing it.
Otherwise, one should always use FitsInAddrBase(). +// +// Arguments +// addr - absolute indirect code address +// +// Returns +// true if absolute indir code addr could be encoded as 32-bit displacement relative to zero. +// +bool CodeGenInterface::genCodeIndirAddrCanBeEncodedAsZeroRelOffset(size_t addr) +{ + return GenTreeIntConCommon::FitsInI32((ssize_t)addr); +} + +// Return true if an absolute indirect code address needs a relocation recorded with VM. +// +// Arguments +// addr - an absolute indirect code address +// +// Returns +// true if indir code addr needs a relocation recorded with VM +// +bool CodeGenInterface::genCodeIndirAddrNeedsReloc(size_t addr) +{ + // If generating relocatable ngen code, then all code addr should go through relocation + if (compiler->opts.compReloc) + { + return true; + } + +#ifdef _TARGET_AMD64_ + // If code addr could be encoded as 32-bit offset relative to IP, we need to record a relocation. + if (genCodeIndirAddrCanBeEncodedAsPCRelOffset(addr)) + { + return true; + } + + // It could be possible that the code indir addr could be encoded as 32-bit displacement relative + // to zero. But we don't need to emit a relocation in that case. + return false; +#else //_TARGET_X86_ + // On x86 there is no need for recording relocations during jitting, + // because all addrs fit within 32-bits. + return false; +#endif //_TARGET_X86_ +} + +// Return true if a direct code address needs to be marked as relocatable. +// +// Arguments +// addr - absolute direct code address +// +// Returns +// true if direct code addr needs a relocation recorded with VM +// +bool CodeGenInterface::genCodeAddrNeedsReloc(size_t addr) +{ + // If generating relocatable ngen code, then all code addr should go through relocation + if (compiler->opts.compReloc) + { + return true; + } + +#ifdef _TARGET_AMD64_ + // By default all direct code addresses go through relocation so that VM will setup + // a jump stub if addr cannot be encoded as pc-relative offset. + return true; +#else //_TARGET_X86_ + // On x86 there is no need for recording relocations during jitting, + // because all addrs fit within 32-bits. + return false; +#endif //_TARGET_X86_ +} +#endif //_TARGET_XARCH_ + +/***************************************************************************** + * + * The following can be used to create basic blocks that serve as labels for + * the emitter. Use with caution - these are not real basic blocks! + * + */ + +// inline +BasicBlock* CodeGen::genCreateTempLabel() +{ +#ifdef DEBUG + // These blocks don't affect FP + compiler->fgSafeBasicBlockCreation = true; +#endif + + BasicBlock* block = compiler->bbNewBasicBlock(BBJ_NONE); + +#ifdef DEBUG + compiler->fgSafeBasicBlockCreation = false; +#endif + + block->bbFlags |= BBF_JMP_TARGET | BBF_HAS_LABEL; + + // Use coldness of current block, as this label will + // be contained in it.
block->bbFlags |= (compiler->compCurBB->bbFlags & BBF_COLD); + +#ifdef DEBUG + block->bbTgtStkDepth = genStackLevel / sizeof(int); +#endif + return block; +} + +// inline +void CodeGen::genDefineTempLabel(BasicBlock* label) +{ +#ifdef DEBUG + if (compiler->opts.dspCode) + { + printf("\n L_M%03u_BB%02u:\n", Compiler::s_compMethodsCount, label->bbNum); + } +#endif + + label->bbEmitCookie = + getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur); + + /* gcInfo.gcRegGCrefSetCur does not account for redundant load-suppression + of GC vars, and the emitter will not know about them */ + + regTracker.rsTrackRegClrPtr(); +} + +/***************************************************************************** + * + * Adjust the stack pointer by the given value; assumes that this follows + * a call so only callee-saved registers (and registers that may hold a + * return value) are used at this point. + */ + +void CodeGen::genAdjustSP(ssize_t delta) +{ +#ifdef _TARGET_X86_ + if (delta == sizeof(int)) + inst_RV(INS_pop, REG_ECX, TYP_INT); + else +#endif + inst_RV_IV(INS_add, REG_SPBASE, delta, EA_PTRSIZE); +} + +#ifdef _TARGET_ARM_ +// return size +// alignmentWB is out param +unsigned CodeGenInterface::InferOpSizeAlign(GenTreePtr op, unsigned* alignmentWB) +{ + unsigned alignment = 0; + unsigned opSize = 0; + + if (op->gtType == TYP_STRUCT || op->OperIsCopyBlkOp()) + { + opSize = InferStructOpSizeAlign(op, &alignment); + } + else + { + alignment = genTypeAlignments[op->TypeGet()]; + opSize = genTypeSizes[op->TypeGet()]; + } + + assert(opSize != 0); + assert(alignment != 0); + + (*alignmentWB) = alignment; + return opSize; +} +// return size +// alignmentWB is out param +unsigned CodeGenInterface::InferStructOpSizeAlign(GenTreePtr op, unsigned* alignmentWB) +{ + unsigned alignment = 0; + unsigned opSize = 0; + + while (op->gtOper == GT_COMMA) + { + op = op->gtOp.gtOp2; + } + + if (op->gtOper == GT_OBJ) + { + CORINFO_CLASS_HANDLE clsHnd = op->AsObj()->gtClass; + opSize = compiler->info.compCompHnd->getClassSize(clsHnd); + alignment = roundUp(compiler->info.compCompHnd->getClassAlignmentRequirement(clsHnd), TARGET_POINTER_SIZE); + } + else if (op->gtOper == GT_LCL_VAR) + { + unsigned varNum = op->gtLclVarCommon.gtLclNum; + LclVarDsc* varDsc = compiler->lvaTable + varNum; + assert(varDsc->lvType == TYP_STRUCT); + opSize = varDsc->lvSize(); + if (varDsc->lvStructDoubleAlign) + { + alignment = TARGET_POINTER_SIZE * 2; + } + else + { + alignment = TARGET_POINTER_SIZE; + } + } + else if (op->OperIsCopyBlkOp()) + { + GenTreePtr op2 = op->gtOp.gtOp2; + + if (op2->OperGet() == GT_CNS_INT) + { + if (op2->IsIconHandle(GTF_ICON_CLASS_HDL)) + { + CORINFO_CLASS_HANDLE clsHnd = (CORINFO_CLASS_HANDLE)op2->gtIntCon.gtIconVal; + opSize = roundUp(compiler->info.compCompHnd->getClassSize(clsHnd), TARGET_POINTER_SIZE); + alignment = + roundUp(compiler->info.compCompHnd->getClassAlignmentRequirement(clsHnd), TARGET_POINTER_SIZE); + } + else + { + opSize = op2->gtIntCon.gtIconVal; + GenTreePtr op1 = op->gtOp.gtOp1; + assert(op1->OperGet() == GT_LIST); + GenTreePtr dstAddr = op1->gtOp.gtOp1; + if (dstAddr->OperGet() == GT_ADDR) + { + InferStructOpSizeAlign(dstAddr->gtOp.gtOp1, &alignment); + } + else + { + assert(!"Unhandled dstAddr node"); + alignment = TARGET_POINTER_SIZE; + } + } + } + else + { + noway_assert(!"Variable sized COPYBLK register arg!"); + opSize = 0; + alignment = TARGET_POINTER_SIZE; + } + } + else if (op->gtOper == GT_MKREFANY) + { + opSize = TARGET_POINTER_SIZE * 2; +
alignment = TARGET_POINTER_SIZE; + } + else if (op->IsArgPlaceHolderNode()) + { + CORINFO_CLASS_HANDLE clsHnd = op->gtArgPlace.gtArgPlaceClsHnd; + assert(clsHnd != 0); + opSize = roundUp(compiler->info.compCompHnd->getClassSize(clsHnd), TARGET_POINTER_SIZE); + alignment = roundUp(compiler->info.compCompHnd->getClassAlignmentRequirement(clsHnd), TARGET_POINTER_SIZE); + } + else + { + assert(!"Unhandled gtOper"); + opSize = TARGET_POINTER_SIZE; + alignment = TARGET_POINTER_SIZE; + } + + assert(opSize != 0); + assert(alignment != 0); + + (*alignmentWB) = alignment; + return opSize; +} + +#endif // _TARGET_ARM_ + +/***************************************************************************** + * + * Take an address expression and try to find the best set of components to + * form an address mode; returns non-zero if this is successful. + * + * TODO-Cleanup: The RyuJIT backend never uses this to actually generate code. + * Refactor this code so that the underlying analysis can be used in + * the RyuJIT Backend to do lowering, instead of having to call this method with the + * option to not generate the code. + * + * 'fold' specifies if it is OK to fold the array index which hangs off + * a GT_NOP node. + * + * If successful, the parameters will be set to the following values: + * + * *rv1Ptr ... base operand + * *rv2Ptr ... optional operand + * *revPtr ... true if rv2 is before rv1 in the evaluation order + * #if SCALED_ADDR_MODES + * *mulPtr ... optional multiplier (2/4/8) for rv2 + * Note that for [reg1 + reg2] and [reg1 + reg2 + icon], *mulPtr == 0. + * #endif + * *cnsPtr ... integer constant [optional] + * + * The 'mode' parameter may have one of the following values: + * + * #if LEA_AVAILABLE + * +1 ... we're trying to compute a value via 'LEA' + * #endif + * + * 0 ... we're trying to form an address mode + * + * -1 ... we're generating code for an address mode, + * and thus the address must already form an + * address mode (without any further work) + * + * IMPORTANT NOTE: This routine doesn't generate any code, it merely + * identifies the components that might be used to + * form an address mode later on. + */ + +bool CodeGen::genCreateAddrMode(GenTreePtr addr, + int mode, + bool fold, + regMaskTP regMask, + bool* revPtr, + GenTreePtr* rv1Ptr, + GenTreePtr* rv2Ptr, +#if SCALED_ADDR_MODES + unsigned* mulPtr, +#endif + unsigned* cnsPtr, + bool nogen) +{ +#ifndef LEGACY_BACKEND + assert(nogen == true); +#endif // !LEGACY_BACKEND + + /* + The following indirections are valid address modes on x86/x64: + + [ icon] * not handled here + [reg ] * not handled here + [reg + icon] + [reg2 + reg1 ] + [reg2 + reg1 + icon] + [reg2 + 2 * reg1 ] + [reg2 + 4 * reg1 ] + [reg2 + 8 * reg1 ] + [ 2 * reg1 + icon] + [ 4 * reg1 + icon] + [ 8 * reg1 + icon] + [reg2 + 2 * reg1 + icon] + [reg2 + 4 * reg1 + icon] + [reg2 + 8 * reg1 + icon] + + The following indirections are valid address modes on arm64: + + [reg] + [reg + icon] + [reg2 + reg1] + [reg2 + reg1 * natural-scale] + + */ + + /* All indirect address modes require the address to be an addition */ + + if (addr->gtOper != GT_ADD) + { + return false; + } + + // Can't use indirect addressing mode as we need to check for overflow. + // Also, can't use 'lea' as it doesn't set the flags. 
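    // Worked example (illustrative): for addr = ADD(ADD(a, LSH(i, 2)), 16) on
    // x86/x64, the walk below produces rv1 = a, rv2 = i, mul = 4, cns = 16,
    // i.e. the address mode [a + 4*i + 16].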
+ + if (addr->gtOverflow()) + { + return false; + } + + GenTreePtr rv1 = nullptr; + GenTreePtr rv2 = nullptr; + + GenTreePtr op1; + GenTreePtr op2; + + ssize_t cns; +#if SCALED_ADDR_MODES + unsigned mul; +#endif + + GenTreePtr tmp; + + /* What order are the sub-operands to be evaluated */ + + if (addr->gtFlags & GTF_REVERSE_OPS) + { + op1 = addr->gtOp.gtOp2; + op2 = addr->gtOp.gtOp1; + } + else + { + op1 = addr->gtOp.gtOp1; + op2 = addr->gtOp.gtOp2; + } + + bool rev = false; // Is op2 first in the evaluation order? + + /* + A complex address mode can combine the following operands: + + op1 ... base address + op2 ... optional scaled index +#if SCALED_ADDR_MODES + mul ... optional multiplier (2/4/8) for op2 +#endif + cns ... optional displacement + + Here we try to find such a set of operands and arrange for these + to sit in registers. + */ + + cns = 0; +#if SCALED_ADDR_MODES + mul = 0; +#endif + +AGAIN: + /* We come back to 'AGAIN' if we have an add of a constant, and we are folding that + constant, or we have gone through a GT_NOP or GT_COMMA node. We never come back + here if we find a scaled index. + */ + CLANG_FORMAT_COMMENT_ANCHOR; + +#if SCALED_ADDR_MODES + assert(mul == 0); +#endif + +#ifdef LEGACY_BACKEND + /* Check both operands as far as being register variables */ + + if (mode != -1) + { + if (op1->gtOper == GT_LCL_VAR) + genMarkLclVar(op1); + if (op2->gtOper == GT_LCL_VAR) + genMarkLclVar(op2); + } +#endif // LEGACY_BACKEND + + /* Special case: keep constants as 'op2' */ + + if (op1->IsCnsIntOrI()) + { + // Presumably op2 is assumed to not be a constant (shouldn't happen if we've done constant folding)? + tmp = op1; + op1 = op2; + op2 = tmp; + } + + /* Check for an addition of a constant */ + + if (op2->IsIntCnsFitsInI32() && (op2->gtType != TYP_REF) && FitsIn<INT32>(cns + op2->gtIntConCommon.IconValue())) + { + /* We're adding a constant */ + + cns += op2->gtIntConCommon.IconValue(); + +#ifdef LEGACY_BACKEND + /* Can (and should) we use "add reg, icon" ? */ + + if ((op1->gtFlags & GTF_REG_VAL) && mode == 1 && !nogen) + { + regNumber reg1 = op1->gtRegNum; + + if ((regMask == 0 || (regMask & genRegMask(reg1))) && genRegTrashable(reg1, addr)) + { + // In case genMarkLclVar(op1) bashed it above and it is + // the last use of the variable. + + genUpdateLife(op1); + + /* 'reg1' is trashable, so add "icon" into it */ + + genIncRegBy(reg1, cns, addr, addr->TypeGet()); + + genUpdateLife(addr); + return true; + } + } +#endif // LEGACY_BACKEND + +#ifdef _TARGET_ARM64_ + if (cns == 0) +#endif + { + /* Inspect the operand the constant is being added to */ + + switch (op1->gtOper) + { + case GT_ADD: + + if (op1->gtOverflow()) + { + break; + } + + op2 = op1->gtOp.gtOp2; + op1 = op1->gtOp.gtOp1; + + goto AGAIN; + +#if SCALED_ADDR_MODES && !defined(_TARGET_ARM64_) + // TODO-ARM64-CQ: For now we don't try to create a scaled index on ARM64. + case GT_MUL: + if (op1->gtOverflow()) + { + return false; // Need overflow check + } + + __fallthrough; + + case GT_LSH: + + mul = op1->GetScaledIndex(); + if (mul) + { + /* We can use "[mul*rv2 + icon]" */ + + rv1 = nullptr; + rv2 = op1->gtOp.gtOp1; + + goto FOUND_AM; + } + break; +#endif + + default: + break; + } + } + + /* The best we can do is "[rv1 + icon]" */ + + rv1 = op1; + rv2 = nullptr; + + goto FOUND_AM; + } + + /* op2 is not a constant. So keep on trying. + Does op1 or op2 already sit in a register? 
+    */
+
+    if (op1->gtFlags & GTF_REG_VAL)
+    {
+        /* op1 is sitting in a register */
+    }
+    else if (op2->gtFlags & GTF_REG_VAL)
+    {
+        /* op2 is sitting in a register. Keep the enregistered value as op1 */
+
+        tmp = op1;
+        op1 = op2;
+        op2 = tmp;
+
+        noway_assert(rev == false);
+        rev = true;
+    }
+    else
+    {
+        /* Neither op1 nor op2 are sitting in a register right now */
+
+        switch (op1->gtOper)
+        {
+#ifndef _TARGET_ARM64_
+            // TODO-ARM64-CQ: For now we don't try to create a scaled index on ARM64.
+            case GT_ADD:
+
+                if (op1->gtOverflow())
+                {
+                    break;
+                }
+
+                if (op1->gtOp.gtOp2->IsIntCnsFitsInI32() && FitsIn<INT32>(cns + op1->gtOp.gtOp2->gtIntCon.gtIconVal))
+                {
+                    cns += op1->gtOp.gtOp2->gtIntCon.gtIconVal;
+                    op1 = op1->gtOp.gtOp1;
+
+                    goto AGAIN;
+                }
+
+                break;
+
+#if SCALED_ADDR_MODES
+
+            case GT_MUL:
+
+                if (op1->gtOverflow())
+                {
+                    break;
+                }
+
+                __fallthrough;
+
+            case GT_LSH:
+
+                mul = op1->GetScaledIndex();
+                if (mul)
+                {
+                    /* 'op1' is a scaled value */
+
+                    rv1 = op2;
+                    rv2 = op1->gtOp.gtOp1;
+
+                    int argScale;
+                    while ((rv2->gtOper == GT_MUL || rv2->gtOper == GT_LSH) && (argScale = rv2->GetScaledIndex()) != 0)
+                    {
+                        if (jitIsScaleIndexMul(argScale * mul))
+                        {
+                            mul = mul * argScale;
+                            rv2 = rv2->gtOp.gtOp1;
+                        }
+                        else
+                        {
+                            break;
+                        }
+                    }
+
+                    noway_assert(rev == false);
+                    rev = true;
+
+                    goto FOUND_AM;
+                }
+                break;
+
+#endif // SCALED_ADDR_MODES
+#endif // !_TARGET_ARM64_
+
+            case GT_NOP:
+
+                if (!nogen)
+                {
+                    break;
+                }
+
+                op1 = op1->gtOp.gtOp1;
+                goto AGAIN;
+
+            case GT_COMMA:
+
+                if (!nogen)
+                {
+                    break;
+                }
+
+                op1 = op1->gtOp.gtOp2;
+                goto AGAIN;
+
+            default:
+                break;
+        }
+
+        noway_assert(op2);
+        switch (op2->gtOper)
+        {
+#ifndef _TARGET_ARM64_
+            // TODO-ARM64-CQ: For now we don't try to create a scaled index on ARM64.
+            case GT_ADD:
+
+                if (op2->gtOverflow())
+                {
+                    break;
+                }
+
+                if (op2->gtOp.gtOp2->IsIntCnsFitsInI32() && FitsIn<INT32>(cns + op2->gtOp.gtOp2->gtIntCon.gtIconVal))
+                {
+                    cns += op2->gtOp.gtOp2->gtIntCon.gtIconVal;
+                    op2 = op2->gtOp.gtOp1;
+
+                    goto AGAIN;
+                }
+
+                break;
+
+#if SCALED_ADDR_MODES
+
+            case GT_MUL:
+
+                if (op2->gtOverflow())
+                {
+                    break;
+                }
+
+                __fallthrough;
+
+            case GT_LSH:
+
+                mul = op2->GetScaledIndex();
+                if (mul)
+                {
+                    // 'op2' is a scaled value...is its argument also scaled?
+                    int argScale;
+                    rv2 = op2->gtOp.gtOp1;
+                    while ((rv2->gtOper == GT_MUL || rv2->gtOper == GT_LSH) && (argScale = rv2->GetScaledIndex()) != 0)
+                    {
+                        if (jitIsScaleIndexMul(argScale * mul))
+                        {
+                            mul = mul * argScale;
+                            rv2 = rv2->gtOp.gtOp1;
+                        }
+                        else
+                        {
+                            break;
+                        }
+                    }
+
+                    rv1 = op1;
+
+                    goto FOUND_AM;
+                }
+                break;
+
+#endif // SCALED_ADDR_MODES
+#endif // !_TARGET_ARM64_
+
+            case GT_NOP:
+
+                if (!nogen)
+                {
+                    break;
+                }
+
+                op2 = op2->gtOp.gtOp1;
+                goto AGAIN;
+
+            case GT_COMMA:
+
+                if (!nogen)
+                {
+                    break;
+                }
+
+                op2 = op2->gtOp.gtOp2;
+                goto AGAIN;
+
+            default:
+                break;
+        }
+
+        goto ADD_OP12;
+    }
+
+    /* op1 is in a register.
+       Is op2 an addition or a scaled value? */
+
+    noway_assert(op2);
+
+#ifndef _TARGET_ARM64_
+    // TODO-ARM64-CQ: For now we don't try to create a scaled index on ARM64.
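+    // As an illustrative sketch of the analysis below (names are hypothetical): given
+    // ADD(lclA, LSH(lclI, 2)) with op1 = lclA already sitting in a register, the GT_LSH
+    // case recognizes the shift as a scale and produces rv1 = lclA, rv2 = lclI, mul = 4,
+    // i.e. the x86 address mode [rv1 + 4*rv2 + cns].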
+    switch (op2->gtOper)
+    {
+        case GT_ADD:
+
+            if (op2->gtOverflow())
+            {
+                break;
+            }
+
+            if (op2->gtOp.gtOp2->IsIntCnsFitsInI32() && FitsIn<INT32>(cns + op2->gtOp.gtOp2->gtIntCon.gtIconVal))
+            {
+                cns += op2->gtOp.gtOp2->gtIntCon.gtIconVal;
+                op2 = op2->gtOp.gtOp1;
+                goto AGAIN;
+            }
+
+            break;
+
+#if SCALED_ADDR_MODES
+
+        case GT_MUL:
+
+            if (op2->gtOverflow())
+            {
+                break;
+            }
+
+            __fallthrough;
+
+        case GT_LSH:
+
+            mul = op2->GetScaledIndex();
+            if (mul)
+            {
+                rv1 = op1;
+                rv2 = op2->gtOp.gtOp1;
+                int argScale;
+                while ((rv2->gtOper == GT_MUL || rv2->gtOper == GT_LSH) && (argScale = rv2->GetScaledIndex()) != 0)
+                {
+                    if (jitIsScaleIndexMul(argScale * mul))
+                    {
+                        mul = mul * argScale;
+                        rv2 = rv2->gtOp.gtOp1;
+                    }
+                    else
+                    {
+                        break;
+                    }
+                }
+
+                goto FOUND_AM;
+            }
+            break;
+
+#endif // SCALED_ADDR_MODES
+
+        default:
+            break;
+    }
+#endif // !_TARGET_ARM64_
+
+ADD_OP12:
+
+    /* The best we can do is "[rv1 + rv2]" or "[rv1 + rv2 + cns]" */
+
+    rv1 = op1;
+    rv2 = op2;
+#ifdef _TARGET_ARM64_
+    assert(cns == 0);
+#endif
+
+FOUND_AM:
+
+#ifdef LEGACY_BACKEND
+    /* Check for register variables */
+
+    if (mode != -1)
+    {
+        if (rv1 && rv1->gtOper == GT_LCL_VAR)
+            genMarkLclVar(rv1);
+        if (rv2 && rv2->gtOper == GT_LCL_VAR)
+            genMarkLclVar(rv2);
+    }
+#endif // LEGACY_BACKEND
+
+    if (rv2)
+    {
+        /* Make sure a GC address doesn't end up in 'rv2' */
+
+        if (varTypeIsGC(rv2->TypeGet()))
+        {
+            noway_assert(rv1 && !varTypeIsGC(rv1->TypeGet()));
+
+            tmp = rv1;
+            rv1 = rv2;
+            rv2 = tmp;
+
+            rev = !rev;
+        }
+
+        /* Special case: constant array index (that is range-checked) */
+
+        if (fold)
+        {
+            ssize_t    tmpMul;
+            GenTreePtr index;
+
+            if ((rv2->gtOper == GT_MUL || rv2->gtOper == GT_LSH) && (rv2->gtOp.gtOp2->IsCnsIntOrI()))
+            {
+                /* For valuetype arrays where we can't use the scaled address
+                   mode, rv2 will point to the scaled index. So we have to do
+                   more work */
+
+                tmpMul = compiler->optGetArrayRefScaleAndIndex(rv2, &index DEBUGARG(false));
+                if (mul)
+                {
+                    tmpMul *= mul;
+                }
+            }
+            else
+            {
+                /* May be a simple array. rv2 will point to the actual index */
+
+                index  = rv2;
+                tmpMul = mul;
+            }
+
+            /* Get hold of the array index and see if it's a constant */
+            if (index->IsIntCnsFitsInI32())
+            {
+                /* Get hold of the index value */
+                ssize_t ixv = index->AsIntConCommon()->IconValue();
+
+#if SCALED_ADDR_MODES
+                /* Scale the index if necessary */
+                if (tmpMul)
+                {
+                    ixv *= tmpMul;
+                }
+#endif
+
+                if (FitsIn<INT32>(cns + ixv))
+                {
+                    /* Add the scaled index to the offset value */
+
+                    cns += ixv;
+
+#if SCALED_ADDR_MODES
+                    /* There is no scaled operand any more */
+                    mul = 0;
+#endif
+                    rv2 = nullptr;
+                }
+            }
+        }
+    }
+
+    // We shouldn't have [rv2*1 + cns] - this is equivalent to [rv1 + cns]
+    noway_assert(rv1 || mul != 1);
+
+    noway_assert(FitsIn<INT32>(cns));
+
+    /* Success - return the various components to the caller */
+
+    *revPtr = rev;
+    *rv1Ptr = rv1;
+    *rv2Ptr = rv2;
+#if SCALED_ADDR_MODES
+    *mulPtr = mul;
+#endif
+    *cnsPtr = (unsigned)cns;
+
+    return true;
+}
+
+/*****************************************************************************
+* The condition (for the jmp/set) to use for the given type of operation
+*
+* On amd64, this routine should be used when there is no gentree available
+* and one needs to generate jumps based on integer comparisons. When a gentree
+* is available, always use its overloaded version.
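+*
+* For example (illustrative, derived from the tables below): a signed GT_LT
+* maps to EJ_jl on xarch and EJ_lt on arm, while the same compare under
+* CK_UNSIGNED maps to EJ_jb / EJ_lo:
+*
+*    emitJumpKind jk = CodeGen::genJumpKindForOper(GT_LT, CK_UNSIGNED); // EJ_jb on xarch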
+* +*/ + +// static +emitJumpKind CodeGen::genJumpKindForOper(genTreeOps cmp, CompareKind compareKind) +{ + const static BYTE genJCCinsSigned[] = { +#if defined(_TARGET_XARCH_) + EJ_je, // GT_EQ + EJ_jne, // GT_NE + EJ_jl, // GT_LT + EJ_jle, // GT_LE + EJ_jge, // GT_GE + EJ_jg, // GT_GT +#elif defined(_TARGET_ARMARCH_) + EJ_eq, // GT_EQ + EJ_ne, // GT_NE + EJ_lt, // GT_LT + EJ_le, // GT_LE + EJ_ge, // GT_GE + EJ_gt, // GT_GT +#endif + }; + + const static BYTE genJCCinsUnsigned[] = /* unsigned comparison */ + { +#if defined(_TARGET_XARCH_) + EJ_je, // GT_EQ + EJ_jne, // GT_NE + EJ_jb, // GT_LT + EJ_jbe, // GT_LE + EJ_jae, // GT_GE + EJ_ja, // GT_GT +#elif defined(_TARGET_ARMARCH_) + EJ_eq, // GT_EQ + EJ_ne, // GT_NE + EJ_lo, // GT_LT + EJ_ls, // GT_LE + EJ_hs, // GT_GE + EJ_hi, // GT_GT +#endif + }; + + const static BYTE genJCCinsLogical[] = /* logical operation */ + { +#if defined(_TARGET_XARCH_) + EJ_je, // GT_EQ (Z == 1) + EJ_jne, // GT_NE (Z == 0) + EJ_js, // GT_LT (S == 1) + EJ_NONE, // GT_LE + EJ_jns, // GT_GE (S == 0) + EJ_NONE, // GT_GT +#elif defined(_TARGET_ARMARCH_) + EJ_eq, // GT_EQ (Z == 1) + EJ_ne, // GT_NE (Z == 0) + EJ_mi, // GT_LT (N == 1) + EJ_NONE, // GT_LE + EJ_pl, // GT_GE (N == 0) + EJ_NONE, // GT_GT +#endif + }; + +#if defined(_TARGET_XARCH_) + assert(genJCCinsSigned[GT_EQ - GT_EQ] == EJ_je); + assert(genJCCinsSigned[GT_NE - GT_EQ] == EJ_jne); + assert(genJCCinsSigned[GT_LT - GT_EQ] == EJ_jl); + assert(genJCCinsSigned[GT_LE - GT_EQ] == EJ_jle); + assert(genJCCinsSigned[GT_GE - GT_EQ] == EJ_jge); + assert(genJCCinsSigned[GT_GT - GT_EQ] == EJ_jg); + + assert(genJCCinsUnsigned[GT_EQ - GT_EQ] == EJ_je); + assert(genJCCinsUnsigned[GT_NE - GT_EQ] == EJ_jne); + assert(genJCCinsUnsigned[GT_LT - GT_EQ] == EJ_jb); + assert(genJCCinsUnsigned[GT_LE - GT_EQ] == EJ_jbe); + assert(genJCCinsUnsigned[GT_GE - GT_EQ] == EJ_jae); + assert(genJCCinsUnsigned[GT_GT - GT_EQ] == EJ_ja); + + assert(genJCCinsLogical[GT_EQ - GT_EQ] == EJ_je); + assert(genJCCinsLogical[GT_NE - GT_EQ] == EJ_jne); + assert(genJCCinsLogical[GT_LT - GT_EQ] == EJ_js); + assert(genJCCinsLogical[GT_GE - GT_EQ] == EJ_jns); +#elif defined(_TARGET_ARMARCH_) + assert(genJCCinsSigned[GT_EQ - GT_EQ] == EJ_eq); + assert(genJCCinsSigned[GT_NE - GT_EQ] == EJ_ne); + assert(genJCCinsSigned[GT_LT - GT_EQ] == EJ_lt); + assert(genJCCinsSigned[GT_LE - GT_EQ] == EJ_le); + assert(genJCCinsSigned[GT_GE - GT_EQ] == EJ_ge); + assert(genJCCinsSigned[GT_GT - GT_EQ] == EJ_gt); + + assert(genJCCinsUnsigned[GT_EQ - GT_EQ] == EJ_eq); + assert(genJCCinsUnsigned[GT_NE - GT_EQ] == EJ_ne); + assert(genJCCinsUnsigned[GT_LT - GT_EQ] == EJ_lo); + assert(genJCCinsUnsigned[GT_LE - GT_EQ] == EJ_ls); + assert(genJCCinsUnsigned[GT_GE - GT_EQ] == EJ_hs); + assert(genJCCinsUnsigned[GT_GT - GT_EQ] == EJ_hi); + + assert(genJCCinsLogical[GT_EQ - GT_EQ] == EJ_eq); + assert(genJCCinsLogical[GT_NE - GT_EQ] == EJ_ne); + assert(genJCCinsLogical[GT_LT - GT_EQ] == EJ_mi); + assert(genJCCinsLogical[GT_GE - GT_EQ] == EJ_pl); +#else + assert(!"unknown arch"); +#endif + assert(GenTree::OperIsCompare(cmp)); + + emitJumpKind result = EJ_COUNT; + + if (compareKind == CK_UNSIGNED) + { + result = (emitJumpKind)genJCCinsUnsigned[cmp - GT_EQ]; + } + else if (compareKind == CK_SIGNED) + { + result = (emitJumpKind)genJCCinsSigned[cmp - GT_EQ]; + } + else if (compareKind == CK_LOGICAL) + { + result = (emitJumpKind)genJCCinsLogical[cmp - GT_EQ]; + } + assert(result != EJ_COUNT); + return result; +} + +/***************************************************************************** + * 
+ * Generate an exit sequence for a return from a method (note: when compiling
+ * for speed there might be multiple exit points).
+ */
+
+void CodeGen::genExitCode(BasicBlock* block)
+{
+#ifdef DEBUGGING_SUPPORT
+    /* Just wrote the first instruction of the epilog - inform debugger
+       Note that this may result in a duplicate IPmapping entry, and
+       that this is ok */
+
+    // For non-optimized debuggable code, there is only one epilog.
+    genIPmappingAdd((IL_OFFSETX)ICorDebugInfo::EPILOG, true);
+#endif // DEBUGGING_SUPPORT
+
+    bool jmpEpilog = ((block->bbFlags & BBF_HAS_JMP) != 0);
+    if (compiler->getNeedsGSSecurityCookie())
+    {
+        genEmitGSCookieCheck(jmpEpilog);
+
+        if (jmpEpilog)
+        {
+            // Dev10 642944 -
+            // The GS cookie check created a temp label that has no live
+            // incoming GC registers; we need to fix that
+
+            unsigned   varNum;
+            LclVarDsc* varDsc;
+
+            /* Figure out which register parameters hold pointers */
+
+            for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->lvaCount && varDsc->lvIsRegArg;
+                 varNum++, varDsc++)
+            {
+                noway_assert(varDsc->lvIsParam);
+
+                gcInfo.gcMarkRegPtrVal(varDsc->lvArgReg, varDsc->TypeGet());
+            }
+
+            getEmitter()->emitThisGCrefRegs = getEmitter()->emitInitGCrefRegs = gcInfo.gcRegGCrefSetCur;
+            getEmitter()->emitThisByrefRegs = getEmitter()->emitInitByrefRegs = gcInfo.gcRegByrefSetCur;
+        }
+    }
+
+    genReserveEpilog(block);
+}
+
+/*****************************************************************************
+ *
+ * Generate code for an out-of-line exception.
+ * For debuggable code, we generate the 'throw' inline.
+ * For non-dbg code, we share the helper blocks created by fgAddCodeRef().
+ */
+
+void CodeGen::genJumpToThrowHlpBlk(emitJumpKind jumpKind, SpecialCodeKind codeKind, GenTreePtr failBlk)
+{
+    if (!compiler->opts.compDbgCode)
+    {
+        /* For non-debuggable code, find and use the helper block for
+           raising the exception. The block may be shared by other trees too. */
+
+        BasicBlock* tgtBlk;
+
+        if (failBlk)
+        {
+            /* We already know which block to jump to. Use that. */
+
+            noway_assert(failBlk->gtOper == GT_LABEL);
+            tgtBlk = failBlk->gtLabel.gtLabBB;
+            noway_assert(
+                tgtBlk ==
+                compiler->fgFindExcptnTarget(codeKind, compiler->bbThrowIndex(compiler->compCurBB))->acdDstBlk);
+        }
+        else
+        {
+            /* Find the helper-block which raises the exception. */
+
+            Compiler::AddCodeDsc* add =
+                compiler->fgFindExcptnTarget(codeKind, compiler->bbThrowIndex(compiler->compCurBB));
+            PREFIX_ASSUME_MSG((add != nullptr), ("ERROR: failed to find exception throw block"));
+            tgtBlk = add->acdDstBlk;
+        }
+
+        noway_assert(tgtBlk);
+
+        // Jump to the exception-throwing block on error.
+
+        inst_JMP(jumpKind, tgtBlk);
+    }
+    else
+    {
+        /* The code to throw the exception will be generated inline, and
+           we will jump around it in the normal non-exception case */
+
+        BasicBlock*  tgtBlk          = nullptr;
+        emitJumpKind reverseJumpKind = emitter::emitReverseJumpKind(jumpKind);
+        if (reverseJumpKind != jumpKind)
+        {
+            tgtBlk = genCreateTempLabel();
+            inst_JMP(reverseJumpKind, tgtBlk);
+        }
+
+        genEmitHelperCall(compiler->acdHelper(codeKind), 0, EA_UNKNOWN);
+
+        /* Define the spot for the normal non-exception case to jump to */
+        if (tgtBlk != nullptr)
+        {
+            assert(reverseJumpKind != jumpKind);
+            genDefineTempLabel(tgtBlk);
+        }
+    }
+}
+
+/*****************************************************************************
+ *
+ * The last operation done was generating code for "tree" and that would
+ * have set the flags. Check if the operation caused an overflow.
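+ *
+ * For example (illustrative, following the mapping below): on xarch a signed
+ * GT_ADD that just set the flags is followed by "jo <overflow-throw-block>",
+ * while a GTF_UNSIGNED add is followed by "jb", since the carry flag signals
+ * unsigned overflow.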
+ */ + +// inline +void CodeGen::genCheckOverflow(GenTreePtr tree) +{ + // Overflow-check should be asked for this tree + noway_assert(tree->gtOverflow()); + + const var_types type = tree->TypeGet(); + + // Overflow checks can only occur for the non-small types: (i.e. TYP_INT,TYP_LONG) + noway_assert(!varTypeIsSmall(type)); + + emitJumpKind jumpKind; + +#ifdef _TARGET_ARM64_ + if (tree->OperGet() == GT_MUL) + { + jumpKind = EJ_ne; + } + else +#endif + { + bool isUnsignedOverflow = ((tree->gtFlags & GTF_UNSIGNED) != 0); + +#if defined(_TARGET_XARCH_) + + jumpKind = isUnsignedOverflow ? EJ_jb : EJ_jo; + +#elif defined(_TARGET_ARMARCH_) + + jumpKind = isUnsignedOverflow ? EJ_lo : EJ_vs; + + if (jumpKind == EJ_lo) + { + if ((tree->OperGet() != GT_SUB) && (tree->gtOper != GT_ASG_SUB)) + { + jumpKind = EJ_hs; + } + } + +#endif // defined(_TARGET_ARMARCH_) + } + + // Jump to the block which will throw the expection + + genJumpToThrowHlpBlk(jumpKind, SCK_OVERFLOW); +} + +#if FEATURE_EH_FUNCLETS + +/***************************************************************************** + * + * Update the current funclet as needed by calling genUpdateCurrentFunclet(). + * For non-BBF_FUNCLET_BEG blocks, it asserts that the current funclet + * is up-to-date. + * + */ + +void CodeGen::genUpdateCurrentFunclet(BasicBlock* block) +{ + if (block->bbFlags & BBF_FUNCLET_BEG) + { + compiler->funSetCurrentFunc(compiler->funGetFuncIdx(block)); + if (compiler->funCurrentFunc()->funKind == FUNC_FILTER) + { + assert(compiler->ehGetDsc(compiler->funCurrentFunc()->funEHIndex)->ebdFilter == block); + } + else + { + // We shouldn't see FUNC_ROOT + assert(compiler->funCurrentFunc()->funKind == FUNC_HANDLER); + assert(compiler->ehGetDsc(compiler->funCurrentFunc()->funEHIndex)->ebdHndBeg == block); + } + } + else + { + assert(compiler->compCurrFuncIdx <= compiler->compFuncInfoCount); + if (compiler->funCurrentFunc()->funKind == FUNC_FILTER) + { + assert(compiler->ehGetDsc(compiler->funCurrentFunc()->funEHIndex)->InFilterRegionBBRange(block)); + } + else if (compiler->funCurrentFunc()->funKind == FUNC_ROOT) + { + assert(!block->hasHndIndex()); + } + else + { + assert(compiler->funCurrentFunc()->funKind == FUNC_HANDLER); + assert(compiler->ehGetDsc(compiler->funCurrentFunc()->funEHIndex)->InHndRegionBBRange(block)); + } + } +} +#endif // FEATURE_EH_FUNCLETS + +/***************************************************************************** + * + * Generate code for the function. 
+ */ + +void CodeGen::genGenerateCode(void** codePtr, ULONG* nativeSizeOfCode) +{ +#ifdef DEBUG + if (verbose) + { + printf("*************** In genGenerateCode()\n"); + compiler->fgDispBasicBlocks(compiler->verboseTrees); + } +#endif + + unsigned codeSize; + unsigned prologSize; + unsigned epilogSize; + + void* consPtr; + +#ifdef DEBUG + genInterruptibleUsed = true; + +#if STACK_PROBES + genNeedPrologStackProbe = false; +#endif + + compiler->fgDebugCheckBBlist(); +#endif // DEBUG + + /* This is the real thing */ + + genPrepForCompiler(); + + /* Prepare the emitter */ + getEmitter()->Init(); +#ifdef DEBUG + VarSetOps::AssignNoCopy(compiler, genTempOldLife, VarSetOps::MakeEmpty(compiler)); +#endif + +#ifdef DEBUG + if (compiler->opts.disAsmSpilled && regSet.rsNeededSpillReg) + { + compiler->opts.disAsm = true; + } + + if (compiler->opts.disAsm) + { + printf("; Assembly listing for method %s\n", compiler->info.compFullName); + + printf("; Emitting "); + + if (compiler->compCodeOpt() == Compiler::SMALL_CODE) + { + printf("SMALL_CODE"); + } + else if (compiler->compCodeOpt() == Compiler::FAST_CODE) + { + printf("FAST_CODE"); + } + else + { + printf("BLENDED_CODE"); + } + + printf(" for "); + + if (compiler->info.genCPU == CPU_X86) + { + printf("generic X86 CPU"); + } + else if (compiler->info.genCPU == CPU_X86_PENTIUM_4) + { + printf("Pentium 4"); + } + else if (compiler->info.genCPU == CPU_X64) + { + if (compiler->canUseAVX()) + { + printf("X64 CPU with AVX"); + } + else + { + printf("X64 CPU with SSE2"); + } + } + + else if (compiler->info.genCPU == CPU_ARM) + { + printf("generic ARM CPU"); + } + + printf("\n"); + + if ((compiler->opts.compFlags & CLFLG_MAXOPT) == CLFLG_MAXOPT) + { + printf("; optimized code\n"); + } + else if (compiler->opts.compDbgCode) + { + printf("; debuggable code\n"); + } + else if (compiler->opts.MinOpts()) + { + printf("; compiler->opts.MinOpts() is true\n"); + } + else + { + printf("; unknown optimization flags\n"); + } + +#if DOUBLE_ALIGN + if (compiler->genDoubleAlign()) + printf("; double-aligned frame\n"); + else +#endif + printf("; %s based frame\n", isFramePointerUsed() ? STR_FPBASE : STR_SPBASE); + + if (genInterruptible) + { + printf("; fully interruptible\n"); + } + else + { + printf("; partially interruptible\n"); + } + + if (compiler->fgHaveProfileData()) + { + printf("; with IBC profile data\n"); + } + + if (compiler->fgProfileData_ILSizeMismatch) + { + printf("; discarded IBC profile data due to mismatch in ILSize\n"); + } + } +#endif // DEBUG + +#ifndef LEGACY_BACKEND + + // For RyuJIT backend, we compute the final frame layout before code generation. This is because LSRA + // has already computed exactly the maximum concurrent number of spill temps of each type that are + // required during code generation. So, there is nothing left to estimate: we can be precise in the frame + // layout. This helps us generate smaller code, and allocate, after code generation, a smaller amount of + // memory from the VM. + + genFinalizeFrame(); + + unsigned maxTmpSize = compiler->tmpSize; // This is precise after LSRA has pre-allocated the temps. + +#else // LEGACY_BACKEND + + // Estimate the frame size: first, estimate the number of spill temps needed by taking the register + // predictor spill temp estimates and stress levels into consideration. Then, compute the tentative + // frame layout using conservative callee-save register estimation (namely, guess they'll all be used + // and thus saved on the frame). + + // Compute the maximum estimated spill temp size. 
+    unsigned maxTmpSize = sizeof(double) + sizeof(float) + sizeof(__int64) + sizeof(void*);
+
+    maxTmpSize += (compiler->tmpDoubleSpillMax * sizeof(double)) + (compiler->tmpIntSpillMax * sizeof(int));
+
+#ifdef DEBUG
+
+    /* When StressRegs is >=1, there will be a bunch of spills not predicted by
+       the predictor (see logic in rsPickReg). It will be very hard to teach
+       the predictor about the behavior of rsPickReg for StressRegs >= 1, so
+       instead let's make maxTmpSize large enough so that we won't be wrong.
+       This means that at StressRegs >= 1, we will not be testing the logic
+       that sets maxTmpSize.
+    */
+
+    if (regSet.rsStressRegs() >= 1)
+    {
+        maxTmpSize += (REG_TMP_ORDER_COUNT * REGSIZE_BYTES);
+    }
+
+    // The JIT uses two passes when assigning stack variable (i.e. args, temps, and locals) locations
+    // in varDsc->lvStkOffs. During the 1st pass (in genGenerateCode), it estimates the maximum possible
+    // size for stack temps and puts it in maxTmpSize. Then it calculates varDsc->lvStkOffs for each
+    // variable based on this estimate. However, during stress mode we might spill more temps on the
+    // stack, which might grow the size of the temp area.
+    // This might cause varDsc->lvStkOffs to change during the 2nd pass (in emitEndCodeGen).
+    // If the change of varDsc->lvStkOffs crosses the threshold for the instruction size, we will then
+    // have a mismatch between the estimated code size (from the 1st pass) and the actual emitted code
+    // size (from the 2nd pass).
+    // Also, if STRESS_UNSAFE_BUFFER_CHECKS is turned on, we might reorder the stack variable locations,
+    // which could cause the mismatch too.
+    //
+    // The following code simply bumps maxTmpSize up to at least BYTE_MAX+1 during stress mode, so that
+    // we don't run into code size problems during stress.
+
+    if (getJitStressLevel() != 0)
+    {
+        if (maxTmpSize < BYTE_MAX + 1)
+        {
+            maxTmpSize = BYTE_MAX + 1;
+        }
+    }
+#endif // DEBUG
+
+    /* Estimate the offsets of locals/arguments and size of frame */
+
+    unsigned lclSize = compiler->lvaFrameSize(Compiler::TENTATIVE_FRAME_LAYOUT);
+
+#ifdef DEBUG
+    //
+    // Display the local frame offsets that we have tentatively decided upon
+    //
+    if (verbose)
+    {
+        compiler->lvaTableDump();
+    }
+#endif // DEBUG
+
+#endif // LEGACY_BACKEND
+
+    getEmitter()->emitBegFN(isFramePointerUsed()
+#if defined(DEBUG)
+                            ,
+                            (compiler->compCodeOpt() != Compiler::SMALL_CODE) &&
+                                !(compiler->opts.eeFlags & CORJIT_FLG_PREJIT)
+#endif
+#ifdef LEGACY_BACKEND
+                            ,
+                            lclSize
+#endif // LEGACY_BACKEND
+                            ,
+                            maxTmpSize);
+
+    /* Now generate code for the function */
+    genCodeForBBlist();
+
+#ifndef LEGACY_BACKEND
+#ifdef DEBUG
+    // After code generation, dump the frame layout again. It should be the same as before code generation, if code
+    // generation hasn't touched it (it shouldn't!).
+    if (verbose)
+    {
+        compiler->lvaTableDump();
+    }
+#endif // DEBUG
+#endif // !LEGACY_BACKEND
+
+    /* We can now generate the function prolog and epilog */
+
+    genGeneratePrologsAndEpilogs();
+
+    /* Bind jump distances */
+
+    getEmitter()->emitJumpDistBind();
+
+    /* The code is now complete and final; it should not change after this. */
+
+    /* Compute the size of the code sections that we are going to ask the VM
+       to allocate. Note that this might not be precisely the size of the
+       code we emit, though it's fatal if we emit more code than the size we
+       compute here.
+       (Note: an example of a case where we emit less code would be useful.)
+ */ + + getEmitter()->emitComputeCodeSizes(); + +#ifdef DEBUG + + // Code to test or stress our ability to run a fallback compile. + // We trigger the fallback here, before asking the VM for any memory, + // because if not, we will leak mem, as the current codebase can't free + // the mem after the emitter asks the VM for it. As this is only a stress + // mode, we only want the functionality, and don't care about the relative + // ugliness of having the failure here. + if (!compiler->jitFallbackCompile) + { + // Use COMPlus_JitNoForceFallback=1 to prevent NOWAY assert testing from happening, + // especially that caused by enabling JIT stress. + if (!JitConfig.JitNoForceFallback()) + { + if (JitConfig.JitForceFallback() || compiler->compStressCompile(Compiler::STRESS_GENERIC_VARN, 5)) + { + NO_WAY_NOASSERT("Stress failure"); + } + } + } + +#endif // DEBUG + + /* We've finished collecting all the unwind information for the function. Now reserve + space for it from the VM. + */ + + compiler->unwindReserve(); + +#if DISPLAY_SIZES + + size_t dataSize = getEmitter()->emitDataSize(); + +#endif // DISPLAY_SIZES + + void* coldCodePtr; + + bool trackedStackPtrsContig; // are tracked stk-ptrs contiguous ? + +#ifdef _TARGET_AMD64_ + trackedStackPtrsContig = false; +#elif defined(_TARGET_ARM_) + // On arm due to prespilling of arguments, tracked stk-ptrs may not be contiguous + trackedStackPtrsContig = !compiler->opts.compDbgEnC && !compiler->compIsProfilerHookNeeded(); +#elif defined(_TARGET_ARM64_) + // Incoming vararg registers are homed on the top of the stack. Tracked var may not be contiguous. + trackedStackPtrsContig = !compiler->opts.compDbgEnC && !compiler->info.compIsVarArgs; +#else + trackedStackPtrsContig = !compiler->opts.compDbgEnC; +#endif + +#ifdef DEBUG + /* We're done generating code for this function */ + compiler->compCodeGenDone = true; +#endif + + compiler->EndPhase(PHASE_GENERATE_CODE); + + codeSize = getEmitter()->emitEndCodeGen(compiler, trackedStackPtrsContig, genInterruptible, genFullPtrRegMap, + (compiler->info.compRetType == TYP_REF), compiler->compHndBBtabCount, + &prologSize, &epilogSize, codePtr, &coldCodePtr, &consPtr); + + compiler->EndPhase(PHASE_EMIT_CODE); + +#ifdef DEBUG + if (compiler->opts.disAsm) + { + printf("; Total bytes of code %d, prolog size %d for method %s\n", codeSize, prologSize, + compiler->info.compFullName); + printf("; ============================================================\n"); + printf(""); // in our logic this causes a flush + } + + if (verbose) + { + printf("*************** After end code gen, before unwindEmit()\n"); + getEmitter()->emitDispIGlist(true); + } +#endif + +#if EMIT_TRACK_STACK_DEPTH + /* Check our max stack level. Needed for fgAddCodeRef(). + We need to relax the assert as our estimation won't include code-gen + stack changes (which we know don't affect fgAddCodeRef()) */ + noway_assert(getEmitter()->emitMaxStackDepth <= + (compiler->fgPtrArgCntMax + compiler->compHndBBtabCount + // Return address for locally-called finallys + genTypeStSz(TYP_LONG) + // longs/doubles may be transferred via stack, etc + (compiler->compTailCallUsed ? 4 : 0))); // CORINFO_HELP_TAILCALL args +#endif + + *nativeSizeOfCode = codeSize; + compiler->info.compNativeCodeSize = (UNATIVE_OFFSET)codeSize; + + // printf("%6u bytes of code generated for %s.%s\n", codeSize, compiler->info.compFullName); + + // Make sure that the x86 alignment and cache prefetch optimization rules + // were obeyed. 
+ + // Don't start a method in the last 7 bytes of a 16-byte alignment area + // unless we are generating SMALL_CODE + // noway_assert( (((unsigned)(*codePtr) % 16) <= 8) || (compiler->compCodeOpt() == SMALL_CODE)); + + /* Now that the code is issued, we can finalize and emit the unwind data */ + + compiler->unwindEmit(*codePtr, coldCodePtr); + +#ifdef DEBUGGING_SUPPORT + + /* Finalize the line # tracking logic after we know the exact block sizes/offsets */ + + genIPmappingGen(); + + /* Finalize the Local Var info in terms of generated code */ + + genSetScopeInfo(); + +#endif // DEBUGGING_SUPPORT + +#ifdef LATE_DISASM + unsigned finalHotCodeSize; + unsigned finalColdCodeSize; + if (compiler->fgFirstColdBlock != nullptr) + { + // We did some hot/cold splitting. The hot section is always padded out to the + // size we thought it would be, but the cold section is not. + assert(codeSize <= compiler->info.compTotalHotCodeSize + compiler->info.compTotalColdCodeSize); + assert(compiler->info.compTotalHotCodeSize > 0); + assert(compiler->info.compTotalColdCodeSize > 0); + finalHotCodeSize = compiler->info.compTotalHotCodeSize; + finalColdCodeSize = codeSize - finalHotCodeSize; + } + else + { + // No hot/cold splitting + assert(codeSize <= compiler->info.compTotalHotCodeSize); + assert(compiler->info.compTotalHotCodeSize > 0); + assert(compiler->info.compTotalColdCodeSize == 0); + finalHotCodeSize = codeSize; + finalColdCodeSize = 0; + } + getDisAssembler().disAsmCode((BYTE*)*codePtr, finalHotCodeSize, (BYTE*)coldCodePtr, finalColdCodeSize); +#endif // LATE_DISASM + + /* Report any exception handlers to the VM */ + + genReportEH(); + +#ifdef JIT32_GCENCODER +#ifdef DEBUG + void* infoPtr = +#endif // DEBUG +#endif + // Create and store the GC info for this method. 
+        genCreateAndStoreGCInfo(codeSize, prologSize, epilogSize DEBUGARG(codePtr));
+
+#ifdef DEBUG
+    FILE* dmpf = jitstdout;
+
+    compiler->opts.dmpHex = false;
+    if (!strcmp(compiler->info.compMethodName, "<name of method you want the hex dump for>"))
+    {
+        FILE*   codf;
+        errno_t ec = fopen_s(&codf, "C:\\JIT.COD", "at"); // NOTE: file append mode
+        if (ec == 0)
+        {
+            assert(codf);
+            dmpf                  = codf;
+            compiler->opts.dmpHex = true;
+        }
+    }
+    if (compiler->opts.dmpHex)
+    {
+        size_t consSize = getEmitter()->emitDataSize();
+        size_t infoSize = compiler->compInfoBlkSize;
+
+        fprintf(dmpf, "Generated code for %s:\n", compiler->info.compFullName);
+        fprintf(dmpf, "\n");
+
+        if (codeSize)
+        {
+            fprintf(dmpf, " Code at %p [%04X bytes]\n", dspPtr(*codePtr), codeSize);
+        }
+        if (consSize)
+        {
+            fprintf(dmpf, " Const at %p [%04X bytes]\n", dspPtr(consPtr), consSize);
+        }
+#ifdef JIT32_GCENCODER
+        if (infoSize)
+            fprintf(dmpf, " Info at %p [%04X bytes]\n", dspPtr(infoPtr), infoSize);
+#endif // JIT32_GCENCODER
+
+        fprintf(dmpf, "\n");
+
+        if (codeSize)
+        {
+            hexDump(dmpf, "Code", (BYTE*)*codePtr, codeSize);
+        }
+        if (consSize)
+        {
+            hexDump(dmpf, "Const", (BYTE*)consPtr, consSize);
+        }
+#ifdef JIT32_GCENCODER
+        if (infoSize)
+            hexDump(dmpf, "Info", (BYTE*)infoPtr, infoSize);
+#endif // JIT32_GCENCODER
+
+        fflush(dmpf);
+    }
+
+    if (dmpf != jitstdout)
+    {
+        fclose(dmpf);
+    }
+
+#endif // DEBUG
+
+    /* Tell the emitter that we're done with this function */
+
+    getEmitter()->emitEndFN();
+
+    /* Shut down the spill logic */
+
+    regSet.rsSpillDone();
+
+    /* Shut down the temp logic */
+
+    compiler->tmpDone();
+
+#if DISPLAY_SIZES
+
+    grossVMsize += compiler->info.compILCodeSize;
+    totalNCsize += codeSize + dataSize + compiler->compInfoBlkSize;
+    grossNCsize += codeSize + dataSize;
+
+#endif // DISPLAY_SIZES
+
+    compiler->EndPhase(PHASE_EMIT_GCEH);
+}
+
+/*****************************************************************************
+ *
+ * Report EH clauses to the VM
+ */
+
+void CodeGen::genReportEH()
+{
+    if (compiler->compHndBBtabCount == 0)
+    {
+        return;
+    }
+
+#ifdef DEBUG
+    if (compiler->opts.dspEHTable)
+    {
+        printf("*************** EH table for %s\n", compiler->info.compFullName);
+    }
+#endif // DEBUG
+
+    unsigned  XTnum;
+    EHblkDsc* HBtab;
+    EHblkDsc* HBtabEnd;
+
+    unsigned EHCount = compiler->compHndBBtabCount;
+
+#if FEATURE_EH_FUNCLETS
+    // Count duplicated clauses. This uses the same logic as below, where we actually generate them for reporting to
+    // the VM.
+    unsigned duplicateClauseCount = 0;
+    unsigned enclosingTryIndex;
+    for (XTnum = 0; XTnum < compiler->compHndBBtabCount; XTnum++)
+    {
+        for (enclosingTryIndex = compiler->ehTrueEnclosingTryIndexIL(XTnum); // find the true enclosing try index,
+                                                                             // ignoring 'mutual protect' trys
+             enclosingTryIndex != EHblkDsc::NO_ENCLOSING_INDEX;
+             enclosingTryIndex = compiler->ehGetEnclosingTryIndex(enclosingTryIndex))
+        {
+            ++duplicateClauseCount;
+        }
+    }
+    EHCount += duplicateClauseCount;
+
+#if FEATURE_EH_CALLFINALLY_THUNKS
+    unsigned clonedFinallyCount = 0;
+
+    // We don't keep track of how many cloned finallys there are. So, go through and count.
+    // We do a quick pass first through the EH table to see if there are any try/finally
+    // clauses. If there aren't, we don't need to look for BBJ_CALLFINALLY.
+ + bool anyFinallys = false; + for (HBtab = compiler->compHndBBtab, HBtabEnd = compiler->compHndBBtab + compiler->compHndBBtabCount; + HBtab < HBtabEnd; HBtab++) + { + if (HBtab->HasFinallyHandler()) + { + anyFinallys = true; + break; + } + } + if (anyFinallys) + { + for (BasicBlock* block = compiler->fgFirstBB; block != nullptr; block = block->bbNext) + { + if (block->bbJumpKind == BBJ_CALLFINALLY) + { + ++clonedFinallyCount; + } + } + + EHCount += clonedFinallyCount; + } +#endif // FEATURE_EH_CALLFINALLY_THUNKS + +#endif // FEATURE_EH_FUNCLETS + +#ifdef DEBUG + if (compiler->opts.dspEHTable) + { +#if FEATURE_EH_FUNCLETS +#if FEATURE_EH_CALLFINALLY_THUNKS + printf("%d EH table entries, %d duplicate clauses, %d cloned finallys, %d total EH entries reported to VM\n", + compiler->compHndBBtabCount, duplicateClauseCount, clonedFinallyCount, EHCount); + assert(compiler->compHndBBtabCount + duplicateClauseCount + clonedFinallyCount == EHCount); +#else // !FEATURE_EH_CALLFINALLY_THUNKS + printf("%d EH table entries, %d duplicate clauses, %d total EH entries reported to VM\n", + compiler->compHndBBtabCount, duplicateClauseCount, EHCount); + assert(compiler->compHndBBtabCount + duplicateClauseCount == EHCount); +#endif // !FEATURE_EH_CALLFINALLY_THUNKS +#else // !FEATURE_EH_FUNCLETS + printf("%d EH table entries, %d total EH entries reported to VM\n", compiler->compHndBBtabCount, EHCount); + assert(compiler->compHndBBtabCount == EHCount); +#endif // !FEATURE_EH_FUNCLETS + } +#endif // DEBUG + + // Tell the VM how many EH clauses to expect. + compiler->eeSetEHcount(EHCount); + + XTnum = 0; // This is the index we pass to the VM + + for (HBtab = compiler->compHndBBtab, HBtabEnd = compiler->compHndBBtab + compiler->compHndBBtabCount; + HBtab < HBtabEnd; HBtab++) + { + UNATIVE_OFFSET tryBeg, tryEnd, hndBeg, hndEnd, hndTyp; + + tryBeg = compiler->ehCodeOffset(HBtab->ebdTryBeg); + hndBeg = compiler->ehCodeOffset(HBtab->ebdHndBeg); + + tryEnd = (HBtab->ebdTryLast == compiler->fgLastBB) ? compiler->info.compNativeCodeSize + : compiler->ehCodeOffset(HBtab->ebdTryLast->bbNext); + hndEnd = (HBtab->ebdHndLast == compiler->fgLastBB) ? compiler->info.compNativeCodeSize + : compiler->ehCodeOffset(HBtab->ebdHndLast->bbNext); + + if (HBtab->HasFilter()) + { + hndTyp = compiler->ehCodeOffset(HBtab->ebdFilter); + } + else + { + hndTyp = HBtab->ebdTyp; + } + + CORINFO_EH_CLAUSE_FLAGS flags = ToCORINFO_EH_CLAUSE_FLAGS(HBtab->ebdHandlerType); + + // Note that we reuse the CORINFO_EH_CLAUSE type, even though the names of + // the fields aren't accurate. + + CORINFO_EH_CLAUSE clause; + clause.ClassToken = hndTyp; /* filter offset is passed back here for filter-based exception handlers */ + clause.Flags = flags; + clause.TryOffset = tryBeg; + clause.TryLength = tryEnd; + clause.HandlerOffset = hndBeg; + clause.HandlerLength = hndEnd; + + assert(XTnum < EHCount); + + // Tell the VM about this EH clause. + compiler->eeSetEHinfo(XTnum, &clause); + + ++XTnum; + } + +#if FEATURE_EH_FUNCLETS + // Now output duplicated clauses. + // + // If a funclet has been created by moving a handler out of a try region that it was originally nested + // within, then we need to report a "duplicate" clause representing the fact that an exception in that + // handler can be caught by the 'try' it has been moved out of. This is because the original 'try' region + // descriptor can only specify a single, contiguous protected range, but the funclet we've moved out is + // no longer contiguous with the original 'try' region. 
The new EH descriptor will have the same handler + // region as the enclosing try region's handler region. This is the sense in which it is duplicated: + // there is now a "duplicate" clause with the same handler region as another, but a different 'try' + // region. + // + // For example, consider this (capital letters represent an unknown code sequence, numbers identify a + // try or handler region): + // + // A + // try (1) { + // B + // try (2) { + // C + // } catch (3) { + // D + // } catch (4) { + // E + // } + // F + // } catch (5) { + // G + // } + // H + // + // Here, we have try region (1) BCDEF protected by catch (5) G, and region (2) C protected + // by catch (3) D and catch (4) E. Note that catch (4) E does *NOT* protect the code "D". + // This is an example of 'mutually protect' regions. First, we move handlers (3) and (4) + // to the end of the code. However, (3) and (4) are nested inside, and protected by, try (1). Again + // note that (3) is not nested inside (4), despite ebdEnclosingTryIndex indicating that. + // The code "D" and "E" won't be contiguous with the protected region for try (1) (which + // will, after moving catch (3) AND (4), be BCF). Thus, we need to add a new EH descriptor + // representing try (1) protecting the new funclets catch (3) and (4). + // The code will be generated as follows: + // + // ABCFH // "main" code + // D // funclet + // E // funclet + // G // funclet + // + // The EH regions are: + // + // C -> D + // C -> E + // BCF -> G + // D -> G // "duplicate" clause + // E -> G // "duplicate" clause + // + // Note that we actually need to generate one of these additional "duplicate" clauses for every + // region the funclet is nested in. Take this example: + // + // A + // try (1) { + // B + // try (2,3) { + // C + // try (4) { + // D + // try (5,6) { + // E + // } catch { + // F + // } catch { + // G + // } + // H + // } catch { + // I + // } + // J + // } catch { + // K + // } catch { + // L + // } + // M + // } catch { + // N + // } + // O + // + // When we pull out funclets, we get the following generated code: + // + // ABCDEHJMO // "main" function + // F // funclet + // G // funclet + // I // funclet + // K // funclet + // L // funclet + // N // funclet + // + // And the EH regions we report to the VM are (in order; main clauses + // first in most-to-least nested order, funclets ("duplicated clauses") + // last, in most-to-least nested) are: + // + // E -> F + // E -> G + // DEH -> I + // CDEHJ -> K + // CDEHJ -> L + // BCDEHJM -> N + // F -> I // funclet clause #1 for F + // F -> K // funclet clause #2 for F + // F -> L // funclet clause #3 for F + // F -> N // funclet clause #4 for F + // G -> I // funclet clause #1 for G + // G -> K // funclet clause #2 for G + // G -> L // funclet clause #3 for G + // G -> N // funclet clause #4 for G + // I -> K // funclet clause #1 for I + // I -> L // funclet clause #2 for I + // I -> N // funclet clause #3 for I + // K -> N // funclet clause #1 for K + // L -> N // funclet clause #1 for L + // + // So whereas the IL had 6 EH clauses, we need to report 19 EH clauses to the VM. + // Note that due to the nature of 'mutually protect' clauses, it would be incorrect + // to add a clause "F -> G" because F is NOT protected by G, but we still have + // both "F -> K" and "F -> L" because F IS protected by both of those handlers. + // + // The overall ordering of the clauses is still the same most-to-least nesting + // after front-to-back start offset. 
Because we place the funclets at the end + // these new clauses should also go at the end by this ordering. + // + + if (duplicateClauseCount > 0) + { + unsigned reportedDuplicateClauseCount = 0; // How many duplicated clauses have we reported? + unsigned XTnum2; + for (XTnum2 = 0, HBtab = compiler->compHndBBtab; XTnum2 < compiler->compHndBBtabCount; XTnum2++, HBtab++) + { + unsigned enclosingTryIndex; + + EHblkDsc* fletTab = compiler->ehGetDsc(XTnum2); + + for (enclosingTryIndex = compiler->ehTrueEnclosingTryIndexIL(XTnum2); // find the true enclosing try index, + // ignoring 'mutual protect' trys + enclosingTryIndex != EHblkDsc::NO_ENCLOSING_INDEX; + enclosingTryIndex = compiler->ehGetEnclosingTryIndex(enclosingTryIndex)) + { + // The funclet we moved out is nested in a try region, so create a new EH descriptor for the funclet + // that will have the enclosing try protecting the funclet. + + noway_assert(XTnum2 < enclosingTryIndex); // the enclosing region must be less nested, and hence have a + // greater EH table index + + EHblkDsc* encTab = compiler->ehGetDsc(enclosingTryIndex); + + // The try region is the handler of the funclet. Note that for filters, we don't protect the + // filter region, only the filter handler region. This is because exceptions in filters never + // escape; the VM swallows them. + + BasicBlock* bbTryBeg = fletTab->ebdHndBeg; + BasicBlock* bbTryLast = fletTab->ebdHndLast; + + BasicBlock* bbHndBeg = encTab->ebdHndBeg; // The handler region is the same as the enclosing try + BasicBlock* bbHndLast = encTab->ebdHndLast; + + UNATIVE_OFFSET tryBeg, tryEnd, hndBeg, hndEnd, hndTyp; + + tryBeg = compiler->ehCodeOffset(bbTryBeg); + hndBeg = compiler->ehCodeOffset(bbHndBeg); + + tryEnd = (bbTryLast == compiler->fgLastBB) ? compiler->info.compNativeCodeSize + : compiler->ehCodeOffset(bbTryLast->bbNext); + hndEnd = (bbHndLast == compiler->fgLastBB) ? compiler->info.compNativeCodeSize + : compiler->ehCodeOffset(bbHndLast->bbNext); + + if (encTab->HasFilter()) + { + hndTyp = compiler->ehCodeOffset(encTab->ebdFilter); + } + else + { + hndTyp = encTab->ebdTyp; + } + + CORINFO_EH_CLAUSE_FLAGS flags = ToCORINFO_EH_CLAUSE_FLAGS(encTab->ebdHandlerType); + + // Tell the VM this is an extra clause caused by moving funclets out of line. + // It seems weird this is from the CorExceptionFlag enum in corhdr.h, + // not the CORINFO_EH_CLAUSE_FLAGS enum in corinfo.h. + flags = (CORINFO_EH_CLAUSE_FLAGS)(flags | COR_ILEXCEPTION_CLAUSE_DUPLICATED); + + // Note that the JIT-EE interface reuses the CORINFO_EH_CLAUSE type, even though the names of + // the fields aren't really accurate. For example, we set "TryLength" to the offset of the + // instruction immediately after the 'try' body. So, it really could be more accurately named + // "TryEndOffset". + + CORINFO_EH_CLAUSE clause; + clause.ClassToken = hndTyp; /* filter offset is passed back here for filter-based exception handlers */ + clause.Flags = flags; + clause.TryOffset = tryBeg; + clause.TryLength = tryEnd; + clause.HandlerOffset = hndBeg; + clause.HandlerLength = hndEnd; + + assert(XTnum < EHCount); + + // Tell the VM about this EH clause (a duplicated clause). 
+ compiler->eeSetEHinfo(XTnum, &clause); + + ++XTnum; + ++reportedDuplicateClauseCount; + +#ifndef DEBUG + if (duplicateClauseCount == reportedDuplicateClauseCount) + { + break; // we've reported all of them; no need to continue looking + } +#endif // !DEBUG + + } // for each 'true' enclosing 'try' + } // for each EH table entry + + assert(duplicateClauseCount == reportedDuplicateClauseCount); + } // if (duplicateClauseCount > 0) + +#if FEATURE_EH_CALLFINALLY_THUNKS + if (anyFinallys) + { + unsigned reportedClonedFinallyCount = 0; + for (BasicBlock* block = compiler->fgFirstBB; block != nullptr; block = block->bbNext) + { + if (block->bbJumpKind == BBJ_CALLFINALLY) + { + UNATIVE_OFFSET hndBeg, hndEnd; + + hndBeg = compiler->ehCodeOffset(block); + + // How big is it? The BBJ_ALWAYS has a null bbEmitCookie! Look for the block after, which must be + // a label or jump target, since the BBJ_CALLFINALLY doesn't fall through. + BasicBlock* bbLabel = block->bbNext; + if (block->isBBCallAlwaysPair()) + { + bbLabel = bbLabel->bbNext; // skip the BBJ_ALWAYS + } + if (bbLabel == nullptr) + { + hndEnd = compiler->info.compNativeCodeSize; + } + else + { + assert(bbLabel->bbEmitCookie != nullptr); + hndEnd = compiler->ehCodeOffset(bbLabel); + } + + CORINFO_EH_CLAUSE clause; + clause.ClassToken = 0; // unused + clause.Flags = (CORINFO_EH_CLAUSE_FLAGS)(CORINFO_EH_CLAUSE_FINALLY | COR_ILEXCEPTION_CLAUSE_DUPLICATED); + clause.TryOffset = hndBeg; + clause.TryLength = hndBeg; + clause.HandlerOffset = hndBeg; + clause.HandlerLength = hndEnd; + + assert(XTnum < EHCount); + + // Tell the VM about this EH clause (a cloned finally clause). + compiler->eeSetEHinfo(XTnum, &clause); + + ++XTnum; + ++reportedClonedFinallyCount; + +#ifndef DEBUG + if (clonedFinallyCount == reportedClonedFinallyCount) + { + break; // we're done; no need to keep looking + } +#endif // !DEBUG + } // block is BBJ_CALLFINALLY + } // for each block + + assert(clonedFinallyCount == reportedClonedFinallyCount); + } // if (anyFinallys) +#endif // FEATURE_EH_CALLFINALLY_THUNKS + +#endif // FEATURE_EH_FUNCLETS + + assert(XTnum == EHCount); +} + +void CodeGen::genGCWriteBarrier(GenTreePtr tgt, GCInfo::WriteBarrierForm wbf) +{ +#ifndef LEGACY_BACKEND + noway_assert(tgt->gtOper == GT_STOREIND); +#else // LEGACY_BACKEND + noway_assert(tgt->gtOper == GT_IND || tgt->gtOper == GT_CLS_VAR); // enforced by gcIsWriteBarrierCandidate +#endif // LEGACY_BACKEND + + /* Call the proper vm helper */ + int helper = CORINFO_HELP_ASSIGN_REF; +#ifdef DEBUG + if (wbf == GCInfo::WBF_NoBarrier_CheckNotHeapInDebug) + { + helper = CORINFO_HELP_ASSIGN_REF_ENSURE_NONHEAP; + } + else +#endif + if (tgt->gtOper != GT_CLS_VAR) + { + if (wbf != GCInfo::WBF_BarrierUnchecked) // This overrides the tests below. 
+        {
+            if (tgt->gtFlags & GTF_IND_TGTANYWHERE)
+            {
+                helper = CORINFO_HELP_CHECKED_ASSIGN_REF;
+            }
+            else if (tgt->gtOp.gtOp1->TypeGet() == TYP_I_IMPL)
+            {
+                helper = CORINFO_HELP_CHECKED_ASSIGN_REF;
+            }
+        }
+    }
+    assert(((helper == CORINFO_HELP_ASSIGN_REF_ENSURE_NONHEAP) && (wbf == GCInfo::WBF_NoBarrier_CheckNotHeapInDebug)) ||
+           ((helper == CORINFO_HELP_CHECKED_ASSIGN_REF) &&
+            (wbf == GCInfo::WBF_BarrierChecked || wbf == GCInfo::WBF_BarrierUnknown)) ||
+           ((helper == CORINFO_HELP_ASSIGN_REF) &&
+            (wbf == GCInfo::WBF_BarrierUnchecked || wbf == GCInfo::WBF_BarrierUnknown)));
+
+#ifdef FEATURE_COUNT_GC_WRITE_BARRIERS
+    // We classify the "tgt" trees as follows:
+    // If "tgt" is of the form (where [ x ] indicates an optional x, and { x1, ..., xn } means "one of the x_i forms"):
+    //    IND [-> ADDR -> IND] -> { GT_LCL_VAR, GT_REG_VAR, ADD({GT_LCL_VAR, GT_REG_VAR}, X), ADD(X, (GT_LCL_VAR,
+    //    GT_REG_VAR)) }
+    // then let "v" be the GT_LCL_VAR or GT_REG_VAR.
+    //   * If "v" is the return buffer argument, classify as CWBKind_RetBuf.
+    //   * If "v" is another by-ref argument, classify as CWBKind_ByRefArg.
+    //   * Otherwise, classify as CWBKind_OtherByRefLocal.
+    // If "tgt" is of the form IND -> ADDR -> GT_LCL_VAR, classify as CWBKind_AddrOfLocal.
+    // Otherwise, classify as CWBKind_Unclassified.
+
+    CheckedWriteBarrierKinds wbKind = CWBKind_Unclassified;
+    if (tgt->gtOper == GT_IND)
+    {
+        GenTreePtr lcl = NULL;
+
+        GenTreePtr indArg = tgt->gtOp.gtOp1;
+        if (indArg->gtOper == GT_ADDR && indArg->gtOp.gtOp1->gtOper == GT_IND)
+        {
+            indArg = indArg->gtOp.gtOp1->gtOp.gtOp1;
+        }
+        if (indArg->gtOper == GT_LCL_VAR || indArg->gtOper == GT_REG_VAR)
+        {
+            lcl = indArg;
+        }
+        else if (indArg->gtOper == GT_ADD)
+        {
+            if (indArg->gtOp.gtOp1->gtOper == GT_LCL_VAR || indArg->gtOp.gtOp1->gtOper == GT_REG_VAR)
+            {
+                lcl = indArg->gtOp.gtOp1;
+            }
+            else if (indArg->gtOp.gtOp2->gtOper == GT_LCL_VAR || indArg->gtOp.gtOp2->gtOper == GT_REG_VAR)
+            {
+                lcl = indArg->gtOp.gtOp2;
+            }
+        }
+        if (lcl != NULL)
+        {
+            wbKind          = CWBKind_OtherByRefLocal; // Unclassified local variable.
+            unsigned lclNum = 0;
+            if (lcl->gtOper == GT_LCL_VAR)
+                lclNum = lcl->gtLclVarCommon.gtLclNum;
+            else
+            {
+                assert(lcl->gtOper == GT_REG_VAR);
+                lclNum = lcl->gtRegVar.gtLclNum;
+            }
+            if (lclNum == compiler->info.compRetBuffArg)
+            {
+                wbKind = CWBKind_RetBuf; // Ret buff. Can happen if the struct exceeds the size limit.
+            }
+            else
+            {
+                LclVarDsc* varDsc = &compiler->lvaTable[lclNum];
+                if (varDsc->lvIsParam && varDsc->lvType == TYP_BYREF)
+                {
+                    wbKind = CWBKind_ByRefArg; // Out (or in/out) arg
+                }
+            }
+        }
+        else
+        {
+            // We should have eliminated the barrier for this case.
+            assert(!(indArg->gtOper == GT_ADDR && indArg->gtOp.gtOp1->gtOper == GT_LCL_VAR));
+        }
+    }
+
+    if (helper == CORINFO_HELP_CHECKED_ASSIGN_REF)
+    {
+#if 0
+#ifdef DEBUG
+        // Enable this to sample the unclassified trees.
+        static int unclassifiedBarrierSite = 0;
+        if (wbKind == CWBKind_Unclassified)
+        {
+            unclassifiedBarrierSite++;
+            printf("unclassifiedBarrierSite = %d:\n", unclassifiedBarrierSite); compiler->gtDispTree(tgt); printf(""); printf("\n");
+        }
+#endif // DEBUG
+#endif // 0
+        genStackLevel += 4;
+        inst_IV(INS_push, wbKind);
+        genEmitHelperCall(helper,
+                          4,           // argSize
+                          EA_PTRSIZE); // retSize
+        genStackLevel -= 4;
+    }
+    else
+    {
+        genEmitHelperCall(helper,
+                          0,           // argSize
+                          EA_PTRSIZE); // retSize
+    }
+
+#else // !FEATURE_COUNT_GC_WRITE_BARRIERS
+    genEmitHelperCall(helper,
+                      0,           // argSize
+                      EA_PTRSIZE); // retSize
+#endif // !FEATURE_COUNT_GC_WRITE_BARRIERS
+}
+
+/*
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XX                                                                             XX
+XX                          Prolog / Epilog                                    XX
+XX                                                                             XX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+*/
+
+/*****************************************************************************
+ *
+ * Generates code for moving incoming register arguments to their
+ * assigned location, in the function prolog.
+ */
+
+#ifdef _PREFAST_
+#pragma warning(push)
+#pragma warning(disable : 21000) // Suppress PREFast warning about overly large function
+#endif
+void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, bool* pXtraRegClobbered, RegState* regState)
+{
+#ifdef DEBUG
+    if (verbose)
+    {
+        printf("*************** In genFnPrologCalleeRegArgs() for %s regs\n", regState->rsIsFloat ? "float" : "int");
+    }
+#endif
+
+#ifdef _TARGET_ARM64_
+    if (compiler->info.compIsVarArgs)
+    {
+        // We've already saved all int registers at the top of stack in the prolog.
+        // No further action is needed.
+        return;
+    }
+#endif
+
+    unsigned  argMax;           // maximum argNum value plus 1 (including the RetBuffArg)
+    unsigned  argNum;           // current argNum, always in [0..argMax-1]
+    unsigned  fixedRetBufIndex; // argNum value used by the fixed return buffer argument (ARM64)
+    unsigned  regArgNum;        // index into the regArgTab[] table
+    regMaskTP regArgMaskLive = regState->rsCalleeRegArgMaskLiveIn;
+    bool      doingFloat     = regState->rsIsFloat;
+
+    // We should be generating the prolog block when we are called
+    assert(compiler->compGeneratingProlog);
+
+    // We should have some registers of the type we are doing that are LiveIn; otherwise we don't need to be called.
+    noway_assert(regArgMaskLive != 0);
+
+    // If a method has 3 args (and no fixed return buffer) then argMax is 3 and valid indexes are 0,1,2
+    // If a method has a fixed return buffer (on ARM64) then argMax gets set to 9 and valid indexes are 0-8
+    //
+    // The regArgTab can always have unused entries,
+    //    for example if an architecture always increments the arg register number but uses either
+    //    an integer register or a floating point register to hold the next argument
+    //    then with a mix of float and integer args you could have:
+    //
+    //    sampleMethod(int i, float x, int j, float y, int k, float z);
+    //          r0, r2 and r4 as valid integer arguments with argMax as 5
+    //      and f1, f3 and f5 as valid floating point arguments with argMax as 6
+    //    The first one is doingFloat==false and the second one is doingFloat==true
+    //
+    //    If a fixed return buffer (in r8) was also present then the first one would become:
+    //          r0, r2, r4 and r8 as valid integer arguments with argMax as 9
+    //
+
+    argMax           = regState->rsCalleeRegArgCount;
+    fixedRetBufIndex = (unsigned)-1; // Invalid value
+
+    // If necessary we will select a correct xtraReg for circular floating point args later.
+    if (doingFloat)
+    {
+        xtraReg = REG_NA;
+        noway_assert(argMax <= MAX_FLOAT_REG_ARG);
+    }
+    else // we are doing the integer registers
+    {
+        noway_assert(argMax <= MAX_REG_ARG);
+        if (hasFixedRetBuffReg())
+        {
+            fixedRetBufIndex = theFixedRetBuffArgNum();
+            // We have an additional integer register argument when hasFixedRetBuffReg() is true
+            argMax = fixedRetBufIndex + 1;
+            assert(argMax == (MAX_REG_ARG + 1));
+        }
+    }
+
+    //
+    // Construct a table with the register arguments, for detecting circular and
+    // non-circular dependencies between the register arguments. A dependency is when
+    // an argument register Rn needs to be moved to register Rm that is also an argument
+    // register. The table is constructed in the order the arguments are passed in
+    // registers: the first register argument is in regArgTab[0], the second in
+    // regArgTab[1], etc. Note that on ARM, a TYP_DOUBLE takes two entries, starting
+    // at an even index. The regArgTab is indexed from 0 to argMax - 1.
+    // Note that due to an extra argument register for ARM64 (i.e. theFixedRetBuffReg())
+    // we have increased the allocated size of the regArgTab[] by one.
+    //
+    struct regArgElem
+    {
+        unsigned varNum; // index into compiler->lvaTable[] for this register argument
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+        var_types type; // the Jit type of this regArgTab entry
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+        unsigned trashBy; // index into this regArgTab[] table of the register that will be copied to this register.
+                          // That is, for regArgTab[x].trashBy = y, argument register number 'y' will be copied to
+                          // argument register number 'x'. Only used when circular = true.
+        char slot;        // 0 means the register is not used for a register argument
+                          // 1 means the first part of a register argument
+                          // 2, 3 or 4 means the second, third or fourth part of a multireg argument
+        bool stackArg;    // true if the argument gets homed to the stack
+        bool processed;   // true after we've processed the argument (and it is in its final location)
+        bool circular;    // true if this register participates in a circular dependency loop.
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+        // For UNIX AMD64 struct passing, the type of the register argument slot can differ from
+        // the type of the lclVar in ways that are not ascertainable from lvType.
+        // So, for that case we retain the type of the register in the regArgTab.
+
+        var_types getRegType(Compiler* compiler)
+        {
+            return type; // UNIX_AMD64 implementation
+        }
+
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+        // In other cases, we simply use the type of the lclVar to determine the type of the register.
+        var_types getRegType(Compiler* compiler)
+        {
+            LclVarDsc varDsc = compiler->lvaTable[varNum];
+            // Check if this is an HFA register arg and return the HFA type
+            if (varDsc.lvIsHfaRegArg())
+            {
+                return varDsc.GetHfaType();
+            }
+            return varDsc.lvType;
+        }
+
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+    } regArgTab[max(MAX_REG_ARG + 1, MAX_FLOAT_REG_ARG)] = {};
+
+    unsigned   varNum;
+    LclVarDsc* varDsc;
+    for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->lvaCount; varNum++, varDsc++)
+    {
+        // Is this variable a register arg?
+        if (!varDsc->lvIsParam)
+        {
+            continue;
+        }
+
+        if (!varDsc->lvIsRegArg)
+        {
+            continue;
+        }
+
+        // When we have a promoted struct we have two possible LclVars that can represent the incoming argument
+        // in the regArgTab[], either the original TYP_STRUCT argument or the introduced lvStructField.
+        // We will use the lvStructField if we have a PROMOTION_TYPE_INDEPENDENT promoted struct field; otherwise
+        // we use the original TYP_STRUCT argument.
+        //
+        if (varDsc->lvPromoted || varDsc->lvIsStructField)
+        {
+            LclVarDsc* parentVarDsc = varDsc;
+            if (varDsc->lvIsStructField)
+            {
+                assert(!varDsc->lvPromoted);
+                parentVarDsc = &compiler->lvaTable[varDsc->lvParentLcl];
+            }
+
+            Compiler::lvaPromotionType promotionType = compiler->lvaGetPromotionType(parentVarDsc);
+
+            if (promotionType == Compiler::PROMOTION_TYPE_INDEPENDENT)
+            {
+                noway_assert(parentVarDsc->lvFieldCnt == 1); // We only handle one field here
+
+                // For register arguments that are independent promoted structs we put the promoted field varNum
+                // in the regArgTab[]
+                if (varDsc->lvPromoted)
+                {
+                    continue;
+                }
+            }
+            else
+            {
+                // For register arguments that are not independent promoted structs we put the parent struct varNum
+                // in the regArgTab[]
+                if (varDsc->lvIsStructField)
+                {
+                    continue;
+                }
+            }
+        }
+
+        var_types regType = varDsc->TypeGet();
+        // Change regType to the HFA type when we have a HFA argument
+        if (varDsc->lvIsHfaRegArg())
+        {
+            regType = varDsc->GetHfaType();
+        }
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+        if (!varTypeIsStruct(regType))
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+        {
+            // A struct might be passed partially in XMM register for System V calls.
+            // So a single arg might use both register files.
+            if (isFloatRegType(regType) != doingFloat)
+            {
+                continue;
+            }
+        }
+
+        int slots = 0;
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+        if (varTypeIsStruct(varDsc))
+        {
+            CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle();
+            assert(typeHnd != nullptr);
+            SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+            compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc);
+            if (!structDesc.passedInRegisters)
+            {
+                // The var is not passed in registers.
+                continue;
+            }
+
+            unsigned firstRegSlot = 0;
+            for (unsigned slotCounter = 0; slotCounter < structDesc.eightByteCount; slotCounter++)
+            {
+                regNumber regNum = varDsc->lvRegNumForSlot(slotCounter);
+                var_types regType;
+
+#ifdef FEATURE_SIMD
+                // Assumption 1:
+                // RyuJit backend depends on the assumption that on 64-bit targets Vector3 size is rounded off
+                // to TARGET_POINTER_SIZE and hence Vector3 locals on stack can be treated as TYP_SIMD16 for
+                // reading and writing purposes. Hence while homing a Vector3 type arg on stack we should
+                // home entire 16-bytes so that the upper-most 4-bytes will be zeroed when written to stack.
+                //
+                // Assumption 2:
+                // RyuJit backend is making another implicit assumption that Vector3 type args when passed in
+                // registers or on stack, the upper most 4-bytes will be zero.
+                //
+                // For P/Invoke return and Reverse P/Invoke argument passing, the native compiler doesn't
+                // guarantee that the upper 4 bytes of a Vector3 type struct are zero initialized and hence
+                // assumption 2 is invalid.
+                //
+                // RyuJIT x64 Windows: arguments are treated as passed by ref and hence read/written just 12
+                // bytes. In case of Vector3 returns, Caller allocates a zero initialized Vector3 local and
+                // passes it as the retBuf arg and Callee method writes only 12 bytes to retBuf. For this
+                // reason, there is no need to clear the upper 4 bytes of Vector3 type args.
+                //
+                // RyuJIT x64 Unix: arguments are treated as passed by value and read/written as if TYP_SIMD16.
+                // Vector3 return values are returned in two return registers and Caller assembles them into a
+                // single xmm reg. Hence RyuJIT explicitly generates code to clear the upper 4 bytes of Vector3
+                // type args in the prolog and of the Vector3 type return value of a call.
+
+                if (varDsc->lvType == TYP_SIMD12)
+                {
+                    regType = TYP_DOUBLE;
+                }
+                else
+#endif
+                {
+                    regType = compiler->GetEightByteType(structDesc, slotCounter);
+                }
+
+                regArgNum = genMapRegNumToRegArgNum(regNum, regType);
+
+                if ((!doingFloat && (structDesc.IsIntegralSlot(slotCounter))) ||
+                    (doingFloat && (structDesc.IsSseSlot(slotCounter))))
+                {
+                    // Store the reg for the first slot.
+                    if (slots == 0)
+                    {
+                        firstRegSlot = regArgNum;
+                    }
+
+                    // Bingo - add it to our table
+                    noway_assert(regArgNum < argMax);
+                    noway_assert(regArgTab[regArgNum].slot == 0); // we better not have added it already (there better
+                                                                  // not be multiple vars representing this argument
+                                                                  // register)
+                    regArgTab[regArgNum].varNum = varNum;
+                    regArgTab[regArgNum].slot   = (char)(slotCounter + 1);
+                    regArgTab[regArgNum].type   = regType;
+                    slots++;
+                }
+            }
+
+            if (slots == 0)
+            {
+                continue; // Nothing to do for this regState set.
+            }
+
+            regArgNum = firstRegSlot;
+        }
+        else
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+        {
+            // Bingo - add it to our table
+            regArgNum = genMapRegNumToRegArgNum(varDsc->lvArgReg, regType);
+
+            noway_assert(regArgNum < argMax);
+            // We better not have added it already (there better not be multiple vars representing this argument
+            // register)
+            noway_assert(regArgTab[regArgNum].slot == 0);
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+            // Set the register type.
+ regArgTab[regArgNum].type = regType;
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+ regArgTab[regArgNum].varNum = varNum;
+ regArgTab[regArgNum].slot = 1;
+
+ slots = 1;
+
+#if FEATURE_MULTIREG_ARGS
+ if (compiler->lvaIsMultiregStruct(varDsc))
+ {
+ if (varDsc->lvIsHfaRegArg())
+ {
+ // We have an HFA argument, set slots to the number of registers used
+ slots = varDsc->lvHfaSlots();
+ }
+ else
+ {
+ // Currently all non-HFA multireg structs are two registers in size (i.e. two slots)
+ assert(varDsc->lvSize() == (2 * TARGET_POINTER_SIZE));
+ // We have a non-HFA multireg argument, set slots to two
+ slots = 2;
+ }
+
+ // Note that regArgNum+1 represents an argument index, not an actual argument register;
+ // see genMapRegArgNumToRegNum(unsigned argNum, var_types type)
+
+ // This is the setup for the rest of a multireg struct arg
+
+ for (int i = 1; i < slots; i++)
+ {
+ noway_assert((regArgNum + i) < argMax);
+
+ // We better not have added it already (there better not be multiple vars representing this argument
+ // register)
+ noway_assert(regArgTab[regArgNum + i].slot == 0);
+
+ regArgTab[regArgNum + i].varNum = varNum;
+ regArgTab[regArgNum + i].slot = (char)(i + 1);
+ }
+ }
+#endif // FEATURE_MULTIREG_ARGS
+ }
+
+#ifdef _TARGET_ARM_
+ int lclSize = compiler->lvaLclSize(varNum);
+
+ if (lclSize > REGSIZE_BYTES)
+ {
+ unsigned maxRegArgNum = doingFloat ? MAX_FLOAT_REG_ARG : MAX_REG_ARG;
+ slots = lclSize / REGSIZE_BYTES;
+ if (regArgNum + slots > maxRegArgNum)
+ {
+ slots = maxRegArgNum - regArgNum;
+ }
+ }
+ C_ASSERT((char)MAX_REG_ARG == MAX_REG_ARG);
+ assert(slots < INT8_MAX);
+ for (char i = 1; i < slots; i++)
+ {
+ regArgTab[regArgNum + i].varNum = varNum;
+ regArgTab[regArgNum + i].slot = i + 1;
+ }
+#endif // _TARGET_ARM_
+
+ for (int i = 0; i < slots; i++)
+ {
+ regType = regArgTab[regArgNum + i].getRegType(compiler);
+ regNumber regNum = genMapRegArgNumToRegNum(regArgNum + i, regType);
+
+#if !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Under FEATURE_UNIX_AMD64_STRUCT_PASSING, lvArgReg could be an INT or FLOAT reg, so the
+ // following assertion wouldn't hold there: the type of the register depends on the
+ // classification of the first eightbyte of the struct. For information on classification
+ // refer to the System V x86_64 ABI at:
+ // http://www.x86-64.org/documentation/abi.pdf
+
+ assert((i > 0) || (regNum == varDsc->lvArgReg));
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // Is the arg dead on entry to the method?
+
+ if ((regArgMaskLive & genRegMask(regNum)) == 0)
+ {
+ if (varDsc->lvTrackedNonStruct())
+ {
+ noway_assert(!VarSetOps::IsMember(compiler, compiler->fgFirstBB->bbLiveIn, varDsc->lvVarIndex));
+ }
+ else
+ {
+#ifdef _TARGET_X86_
+ noway_assert(varDsc->lvType == TYP_STRUCT);
+#else // !_TARGET_X86_
+#ifndef LEGACY_BACKEND
+ // For LSRA, it may not be in regArgMaskLive if it has a zero
+ // refcnt. This is in contrast with the non-LSRA case in which all
+ // non-tracked args are assumed live on entry.
+ noway_assert((varDsc->lvRefCnt == 0) || (varDsc->lvType == TYP_STRUCT) || + (varDsc->lvAddrExposed && compiler->info.compIsVarArgs)); +#else // LEGACY_BACKEND + noway_assert( + varDsc->lvType == TYP_STRUCT || + (varDsc->lvAddrExposed && (compiler->info.compIsVarArgs || compiler->opts.compUseSoftFP))); +#endif // LEGACY_BACKEND +#endif // !_TARGET_X86_ + } + // Mark it as processed and be done with it + regArgTab[regArgNum + i].processed = true; + goto NON_DEP; + } + +#ifdef _TARGET_ARM_ + // On the ARM when the varDsc is a struct arg (or pre-spilled due to varargs) the initReg/xtraReg + // could be equal to lvArgReg. The pre-spilled registers are also not considered live either since + // they've already been spilled. + // + if ((regSet.rsMaskPreSpillRegs(false) & genRegMask(regNum)) == 0) +#endif // _TARGET_ARM_ + { + noway_assert(xtraReg != varDsc->lvArgReg + i); + noway_assert(regArgMaskLive & genRegMask(regNum)); + } + + regArgTab[regArgNum + i].processed = false; + + /* mark stack arguments since we will take care of those first */ + regArgTab[regArgNum + i].stackArg = (varDsc->lvIsInReg()) ? false : true; + + /* If it goes on the stack or in a register that doesn't hold + * an argument anymore -> CANNOT form a circular dependency */ + + if (varDsc->lvIsInReg() && (genRegMask(regNum) & regArgMaskLive)) + { + /* will trash another argument -> possible dependency + * We may need several passes after the table is constructed + * to decide on that */ + + /* Maybe the argument stays in the register (IDEAL) */ + + if ((i == 0) && (varDsc->lvRegNum == regNum)) + { + goto NON_DEP; + } + +#if !defined(_TARGET_64BIT_) + if ((i == 1) && varTypeIsStruct(varDsc) && (varDsc->lvOtherReg == regNum)) + { + goto NON_DEP; + } + if ((i == 1) && (genActualType(varDsc->TypeGet()) == TYP_LONG) && (varDsc->lvOtherReg == regNum)) + { + goto NON_DEP; + } + + if ((i == 1) && (genActualType(varDsc->TypeGet()) == TYP_DOUBLE) && + (REG_NEXT(varDsc->lvRegNum) == regNum)) + { + goto NON_DEP; + } +#endif // !defined(_TARGET_64BIT_) + regArgTab[regArgNum + i].circular = true; + } + else + { + NON_DEP: + regArgTab[regArgNum + i].circular = false; + + /* mark the argument register as free */ + regArgMaskLive &= ~genRegMask(regNum); + } + } + } + + /* Find the circular dependencies for the argument registers, if any. + * A circular dependency is a set of registers R1, R2, ..., Rn + * such that R1->R2 (that is, R1 needs to be moved to R2), R2->R3, ..., Rn->R1 */ + + bool change = true; + if (regArgMaskLive) + { + /* Possible circular dependencies still exist; the previous pass was not enough + * to filter them out. Use a "sieve" strategy to find all circular dependencies. 
*/
+
+ while (change)
+ {
+ change = false;
+
+ for (argNum = 0; argNum < argMax; argNum++)
+ {
+ // If we already marked the argument as non-circular then continue
+
+ if (!regArgTab[argNum].circular)
+ {
+ continue;
+ }
+
+ if (regArgTab[argNum].slot == 0) // Not a register argument
+ {
+ continue;
+ }
+
+ varNum = regArgTab[argNum].varNum;
+ noway_assert(varNum < compiler->lvaCount);
+ varDsc = compiler->lvaTable + varNum;
+ noway_assert(varDsc->lvIsParam && varDsc->lvIsRegArg);
+
+ /* cannot possibly have stack arguments */
+ noway_assert(varDsc->lvIsInReg());
+ noway_assert(!regArgTab[argNum].stackArg);
+
+ var_types regType = regArgTab[argNum].getRegType(compiler);
+ regNumber regNum = genMapRegArgNumToRegNum(argNum, regType);
+
+ regNumber destRegNum = REG_NA;
+ if (regArgTab[argNum].slot == 1)
+ {
+ destRegNum = varDsc->lvRegNum;
+ }
+#if FEATURE_MULTIREG_ARGS && defined(FEATURE_SIMD) && defined(_TARGET_AMD64_)
+ else
+ {
+ assert(regArgTab[argNum].slot == 2);
+ assert(argNum > 0);
+ assert(regArgTab[argNum - 1].slot == 1);
+ assert(regArgTab[argNum - 1].varNum == varNum);
+ assert((varDsc->lvType == TYP_SIMD12) || (varDsc->lvType == TYP_SIMD16));
+ regArgMaskLive &= ~genRegMask(regNum);
+ regArgTab[argNum].circular = false;
+ change = true;
+ continue;
+ }
+#elif !defined(_TARGET_64BIT_)
+ else if (regArgTab[argNum].slot == 2 && genActualType(varDsc->TypeGet()) == TYP_LONG)
+ {
+ destRegNum = varDsc->lvOtherReg;
+ }
+ else
+ {
+ assert(regArgTab[argNum].slot == 2);
+ assert(varDsc->TypeGet() == TYP_DOUBLE);
+ destRegNum = REG_NEXT(varDsc->lvRegNum);
+ }
+#endif // !defined(_TARGET_64BIT_)
+ noway_assert(destRegNum != REG_NA);
+ if (genRegMask(destRegNum) & regArgMaskLive)
+ {
+ /* we are trashing a live argument register - record it */
+ unsigned destRegArgNum = genMapRegNumToRegArgNum(destRegNum, regType);
+ noway_assert(destRegArgNum < argMax);
+ regArgTab[destRegArgNum].trashBy = argNum;
+ }
+ else
+ {
+ /* argument goes to a free register */
+ regArgTab[argNum].circular = false;
+ change = true;
+
+ /* mark the argument register as free */
+ regArgMaskLive &= ~genRegMask(regNum);
+ }
+ }
+ }
+ }
+
+ /* At this point, everything that has the "circular" flag
+ * set to "true" forms a circular dependency */
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#ifdef DEBUG
+ if (regArgMaskLive)
+ {
+ if (verbose)
+ {
+ printf("Circular dependencies found while homing the incoming arguments.\n");
+ }
+ }
+#endif
+
+ // LSRA allocates registers to incoming parameters in order and will not overwrite
+ // a register still holding a live parameter.
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#ifndef LEGACY_BACKEND
+ noway_assert(((regArgMaskLive & RBM_FLTARG_REGS) == 0) &&
+ "Homing of float argument registers with circular dependencies not implemented.");
+#endif // !LEGACY_BACKEND
+
+ /* Now move the arguments to their locations.
+ * First consider ones that go on the stack since they may
+ * free some registers. */
+
+ regArgMaskLive = regState->rsCalleeRegArgMaskLiveIn; // reset the live in to what it was at the start
+ for (argNum = 0; argNum < argMax; argNum++)
+ {
+ emitAttr size;
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // If this is the wrong register file, just continue.
+ if (regArgTab[argNum].type == TYP_UNDEF)
+ {
+ // This could happen if the reg in regArgTab[argNum] is of the other register file -
+ // for System V register passed structs where the first reg is GPR and the second an XMM reg.
+ // The next register file processing will process it.
+ continue;
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // If the arg is dead on entry to the method, skip it
+
+ if (regArgTab[argNum].processed)
+ {
+ continue;
+ }
+
+ if (regArgTab[argNum].slot == 0) // Not a register argument
+ {
+ continue;
+ }
+
+ varNum = regArgTab[argNum].varNum;
+ noway_assert(varNum < compiler->lvaCount);
+ varDsc = compiler->lvaTable + varNum;
+
+#ifndef _TARGET_64BIT_
+ // If not a stack arg go to the next one
+ if (varDsc->lvType == TYP_LONG)
+ {
+ if (regArgTab[argNum].slot == 1 && !regArgTab[argNum].stackArg)
+ {
+ continue;
+ }
+ else if (varDsc->lvOtherReg != REG_STK)
+ {
+ continue;
+ }
+ }
+ else
+#endif // !_TARGET_64BIT_
+ {
+ // If not a stack arg go to the next one
+ if (!regArgTab[argNum].stackArg)
+ {
+ continue;
+ }
+ }
+
+#if defined(_TARGET_ARM_)
+ if (varDsc->lvType == TYP_DOUBLE)
+ {
+ if (regArgTab[argNum].slot == 2)
+ {
+ // We handled the entire double when processing the first half (slot == 1)
+ continue;
+ }
+ }
+#endif
+
+ noway_assert(regArgTab[argNum].circular == false);
+
+ noway_assert(varDsc->lvIsParam);
+ noway_assert(varDsc->lvIsRegArg);
+ noway_assert(varDsc->lvIsInReg() == false ||
+ (varDsc->lvType == TYP_LONG && varDsc->lvOtherReg == REG_STK && regArgTab[argNum].slot == 2));
+
+ var_types storeType = TYP_UNDEF;
+ unsigned slotSize = TARGET_POINTER_SIZE;
+
+ if (varTypeIsStruct(varDsc))
+ {
+ storeType = TYP_I_IMPL; // Default store type for a struct type is a pointer sized integer
+#if FEATURE_MULTIREG_ARGS
+ // Must be <= MAX_PASS_MULTIREG_BYTES or else it wouldn't be passed in registers
+ noway_assert(varDsc->lvSize() <= MAX_PASS_MULTIREG_BYTES);
+#endif // FEATURE_MULTIREG_ARGS
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ storeType = regArgTab[argNum].type;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (varDsc->lvIsHfaRegArg())
+ {
+#ifdef _TARGET_ARM_
+ // On ARM32 the storeType for HFA args is always TYP_FLOAT
+ storeType = TYP_FLOAT;
+ slotSize = (unsigned)emitActualTypeSize(storeType);
+#else // _TARGET_ARM64_
+ storeType = genActualType(varDsc->GetHfaType());
+ slotSize = (unsigned)emitActualTypeSize(storeType);
+#endif // _TARGET_ARM64_
+ }
+ }
+ else // Not a struct type
+ {
+ storeType = genActualType(varDsc->TypeGet());
+ }
+ size = emitActualTypeSize(storeType);
+#ifdef _TARGET_X86_
+ noway_assert(genTypeSize(storeType) == TARGET_POINTER_SIZE);
+#endif //_TARGET_X86_
+
+ regNumber srcRegNum = genMapRegArgNumToRegNum(argNum, storeType);
+
+ // Stack argument - if the ref count is 0, we don't care about it
+
+ if (!varDsc->lvOnFrame)
+ {
+ noway_assert(varDsc->lvRefCnt == 0);
+ }
+ else
+ {
+ // Since slot is typically 1, baseOffset is typically 0
+ int baseOffset = (regArgTab[argNum].slot - 1) * slotSize;
+
+ getEmitter()->emitIns_S_R(ins_Store(storeType), size, srcRegNum, varNum, baseOffset);
+
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // Check if we are writing past the end of the struct
+ if (varTypeIsStruct(varDsc))
+ {
+ assert(varDsc->lvSize() >= baseOffset + (unsigned)size);
+ }
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ if (regArgTab[argNum].slot == 1)
+ {
+ psiMoveToStack(varNum);
+ }
+ }
+
+ /* mark the argument as processed */
+
+ regArgTab[argNum].processed = true;
+ regArgMaskLive &= ~genRegMask(srcRegNum);
+
+#if defined(_TARGET_ARM_)
+ if (storeType == TYP_DOUBLE)
+ {
+ regArgTab[argNum + 1].processed = true;
+ regArgMaskLive &= ~genRegMask(REG_NEXT(srcRegNum));
+ }
+#endif
+ }
+
+ /* Process any circular dependencies */
+ if (regArgMaskLive)
+ {
+ unsigned begReg, destReg,
srcReg; + unsigned varNumDest, varNumSrc; + LclVarDsc* varDscDest; + LclVarDsc* varDscSrc; + instruction insCopy = INS_mov; + + if (doingFloat) + { +#if defined(FEATURE_HFA) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + insCopy = ins_Copy(TYP_DOUBLE); + // Compute xtraReg here when we have a float argument + assert(xtraReg == REG_NA); + + regMaskTP fpAvailMask; + + fpAvailMask = RBM_FLT_CALLEE_TRASH & ~regArgMaskLive; +#if defined(FEATURE_HFA) + fpAvailMask &= RBM_ALLDOUBLE; +#else +#if !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +#error Error. Wrong architecture. +#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +#endif // defined(FEATURE_HFA) + + if (fpAvailMask == RBM_NONE) + { + fpAvailMask = RBM_ALLFLOAT & ~regArgMaskLive; +#if defined(FEATURE_HFA) + fpAvailMask &= RBM_ALLDOUBLE; +#else +#if !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +#error Error. Wrong architecture. +#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +#endif // defined(FEATURE_HFA) + } + + assert(fpAvailMask != RBM_NONE); + + // We pick the lowest avail register number + regMaskTP tempMask = genFindLowestBit(fpAvailMask); + xtraReg = genRegNumFromMask(tempMask); +#elif defined(_TARGET_X86_) + // This case shouldn't occur on x86 since NYI gets converted to an assert + NYI("Homing circular FP registers via xtraReg"); +#endif + } + + for (argNum = 0; argNum < argMax; argNum++) + { + // If not a circular dependency then continue + if (!regArgTab[argNum].circular) + { + continue; + } + + // If already processed the dependency then continue + + if (regArgTab[argNum].processed) + { + continue; + } + + if (regArgTab[argNum].slot == 0) // Not a register argument + { + continue; + } + + destReg = begReg = argNum; + srcReg = regArgTab[argNum].trashBy; + + varNumDest = regArgTab[destReg].varNum; + noway_assert(varNumDest < compiler->lvaCount); + varDscDest = compiler->lvaTable + varNumDest; + noway_assert(varDscDest->lvIsParam && varDscDest->lvIsRegArg); + + noway_assert(srcReg < argMax); + varNumSrc = regArgTab[srcReg].varNum; + noway_assert(varNumSrc < compiler->lvaCount); + varDscSrc = compiler->lvaTable + varNumSrc; + noway_assert(varDscSrc->lvIsParam && varDscSrc->lvIsRegArg); + + emitAttr size = EA_PTRSIZE; + +#ifdef _TARGET_XARCH_ + // + // The following code relies upon the target architecture having an + // 'xchg' instruction which directly swaps the values held in two registers. + // On the ARM architecture we do not have such an instruction. + // + if (destReg == regArgTab[srcReg].trashBy) + { + /* only 2 registers form the circular dependency - use "xchg" */ + + varNum = regArgTab[argNum].varNum; + noway_assert(varNum < compiler->lvaCount); + varDsc = compiler->lvaTable + varNum; + noway_assert(varDsc->lvIsParam && varDsc->lvIsRegArg); + + noway_assert(genTypeSize(genActualType(varDscSrc->TypeGet())) <= REGSIZE_BYTES); + + /* Set "size" to indicate GC if one and only one of + * the operands is a pointer + * RATIONALE: If both are pointers, nothing changes in + * the GC pointer tracking. 
If only one is a pointer we + * have to "swap" the registers in the GC reg pointer mask + */ + + if (varTypeGCtype(varDscSrc->TypeGet()) != varTypeGCtype(varDscDest->TypeGet())) + { + size = EA_GCREF; + } + + noway_assert(varDscDest->lvArgReg == varDscSrc->lvRegNum); + + getEmitter()->emitIns_R_R(INS_xchg, size, varDscSrc->lvRegNum, varDscSrc->lvArgReg); + regTracker.rsTrackRegTrash(varDscSrc->lvRegNum); + regTracker.rsTrackRegTrash(varDscSrc->lvArgReg); + + /* mark both arguments as processed */ + regArgTab[destReg].processed = true; + regArgTab[srcReg].processed = true; + + regArgMaskLive &= ~genRegMask(varDscSrc->lvArgReg); + regArgMaskLive &= ~genRegMask(varDscDest->lvArgReg); + + psiMoveToReg(varNumSrc); + psiMoveToReg(varNumDest); + } + else +#endif // _TARGET_XARCH_ + { + var_types destMemType = varDscDest->TypeGet(); + +#ifdef _TARGET_ARM_ + bool cycleAllDouble = true; // assume the best + + unsigned iter = begReg; + do + { + if (compiler->lvaTable[regArgTab[iter].varNum].TypeGet() != TYP_DOUBLE) + { + cycleAllDouble = false; + break; + } + iter = regArgTab[iter].trashBy; + } while (iter != begReg); + + // We may treat doubles as floats for ARM because we could have partial circular + // dependencies of a float with a lo/hi part of the double. We mark the + // trashBy values for each slot of the double, so let the circular dependency + // logic work its way out for floats rather than doubles. If a cycle has all + // doubles, then optimize so that instead of two vmov.f32's to move a double, + // we can use one vmov.f64. + // + if (!cycleAllDouble && destMemType == TYP_DOUBLE) + { + destMemType = TYP_FLOAT; + } +#endif // _TARGET_ARM_ + + if (destMemType == TYP_REF) + { + size = EA_GCREF; + } + else if (destMemType == TYP_BYREF) + { + size = EA_BYREF; + } + else if (destMemType == TYP_DOUBLE) + { + size = EA_8BYTE; + } + else if (destMemType == TYP_FLOAT) + { + size = EA_4BYTE; + } + + /* move the dest reg (begReg) in the extra reg */ + + assert(xtraReg != REG_NA); + + regNumber begRegNum = genMapRegArgNumToRegNum(begReg, destMemType); + + getEmitter()->emitIns_R_R(insCopy, size, xtraReg, begRegNum); + + regTracker.rsTrackRegCopy(xtraReg, begRegNum); + + *pXtraRegClobbered = true; + + psiMoveToReg(varNumDest, xtraReg); + + /* start moving everything to its right place */ + + while (srcReg != begReg) + { + /* mov dest, src */ + + regNumber destRegNum = genMapRegArgNumToRegNum(destReg, destMemType); + regNumber srcRegNum = genMapRegArgNumToRegNum(srcReg, destMemType); + + getEmitter()->emitIns_R_R(insCopy, size, destRegNum, srcRegNum); + + regTracker.rsTrackRegCopy(destRegNum, srcRegNum); + + /* mark 'src' as processed */ + noway_assert(srcReg < argMax); + regArgTab[srcReg].processed = true; +#ifdef _TARGET_ARM_ + if (size == EA_8BYTE) + regArgTab[srcReg + 1].processed = true; +#endif + regArgMaskLive &= ~genMapArgNumToRegMask(srcReg, destMemType); + + /* move to the next pair */ + destReg = srcReg; + srcReg = regArgTab[srcReg].trashBy; + + varDscDest = varDscSrc; + destMemType = varDscDest->TypeGet(); +#ifdef _TARGET_ARM_ + if (!cycleAllDouble && destMemType == TYP_DOUBLE) + { + destMemType = TYP_FLOAT; + } +#endif + varNumSrc = regArgTab[srcReg].varNum; + noway_assert(varNumSrc < compiler->lvaCount); + varDscSrc = compiler->lvaTable + varNumSrc; + noway_assert(varDscSrc->lvIsParam && varDscSrc->lvIsRegArg); + + if (destMemType == TYP_REF) + { + size = EA_GCREF; + } + else if (destMemType == TYP_DOUBLE) + { + size = EA_8BYTE; + } + else + { + size = EA_4BYTE; + } + } + + /* take care 
of the beginning register */
+
+ noway_assert(srcReg == begReg);
+
+ /* move the extra reg (which holds the original value of begReg) to the last dest reg */
+
+ regNumber destRegNum = genMapRegArgNumToRegNum(destReg, destMemType);
+
+ getEmitter()->emitIns_R_R(insCopy, size, destRegNum, xtraReg);
+
+ regTracker.rsTrackRegCopy(destRegNum, xtraReg);
+
+ psiMoveToReg(varNumSrc);
+
+ /* mark the beginning register as processed */
+
+ regArgTab[srcReg].processed = true;
+#ifdef _TARGET_ARM_
+ if (size == EA_8BYTE)
+ regArgTab[srcReg + 1].processed = true;
+#endif
+ regArgMaskLive &= ~genMapArgNumToRegMask(srcReg, destMemType);
+ }
+ }
+ }
+
+ /* Finally take care of the remaining arguments that must be enregistered */
+ while (regArgMaskLive)
+ {
+ regMaskTP regArgMaskLiveSave = regArgMaskLive;
+
+ for (argNum = 0; argNum < argMax; argNum++)
+ {
+ /* If already processed go to the next one */
+ if (regArgTab[argNum].processed)
+ {
+ continue;
+ }
+
+ if (regArgTab[argNum].slot == 0)
+ { // Not a register argument
+ continue;
+ }
+
+ varNum = regArgTab[argNum].varNum;
+ noway_assert(varNum < compiler->lvaCount);
+ varDsc = compiler->lvaTable + varNum;
+ var_types regType = regArgTab[argNum].getRegType(compiler);
+ regNumber regNum = genMapRegArgNumToRegNum(argNum, regType);
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (regType == TYP_UNDEF)
+ {
+ // This could happen if the reg in regArgTab[argNum] is of the other register file -
+ // for System V register passed structs where the first reg is GPR and the second an XMM reg.
+ // The next register file processing will process it.
+ regArgMaskLive &= ~genRegMask(regNum);
+ continue;
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+ noway_assert(varDsc->lvIsParam && varDsc->lvIsRegArg);
+#ifndef _TARGET_64BIT_
+#ifndef _TARGET_ARM_
+ // Right now we think that incoming arguments are not pointer sized. When we eventually
+ // understand the calling convention, this still won't be true. But maybe we'll have a better
+ // idea of how to ignore it.
+
+ // On Arm, a long can be passed in register
+ noway_assert(genTypeSize(genActualType(varDsc->TypeGet())) == sizeof(void*));
+#endif
+#endif // !_TARGET_64BIT_
+
+ noway_assert(varDsc->lvIsInReg() && !regArgTab[argNum].circular);
+
+ /* Register argument - hopefully it stays in the same register */
+ regNumber destRegNum = REG_NA;
+ var_types destMemType = varDsc->TypeGet();
+
+ if (regArgTab[argNum].slot == 1)
+ {
+ destRegNum = varDsc->lvRegNum;
+
+#ifdef _TARGET_ARM_
+ if (genActualType(destMemType) == TYP_DOUBLE && regArgTab[argNum + 1].processed)
+ {
+ // The second half of the double has already been processed! Treat this as a single.
+ destMemType = TYP_FLOAT;
+ }
+#endif // _TARGET_ARM_
+ }
+#ifndef _TARGET_64BIT_
+ else if (regArgTab[argNum].slot == 2 && genActualType(destMemType) == TYP_LONG)
+ {
+#ifndef LEGACY_BACKEND
+ assert(genActualType(varDsc->TypeGet()) == TYP_LONG || genActualType(varDsc->TypeGet()) == TYP_DOUBLE);
+ if (genActualType(varDsc->TypeGet()) == TYP_DOUBLE)
+ {
+ destRegNum = regNum;
+ }
+ else
+#endif // !LEGACY_BACKEND
+ destRegNum = varDsc->lvOtherReg;
+
+ assert(destRegNum != REG_STK);
+ }
+ else
+ {
+ assert(regArgTab[argNum].slot == 2);
+ assert(destMemType == TYP_DOUBLE);
+
+ // For doubles, we move the entire double using the argNum representing
+ // the first half of the double.
There are two things we won't do:
+ // (1) move the double when the 1st half of the destination is free but the
+ // 2nd half is occupied, and (2) move the double when the 2nd half of the
+ // destination is free but the 1st half is occupied. Here we consider the
+ // case where the first half can't be moved initially because its target is
+ // still busy, but the second half can be moved. We wait until the entire
+ // double can be moved, if possible. For example, we have F0/F1 double moving to F2/F3,
+ // and F2 single moving to F16. When we process F0, its target F2 is busy,
+ // so we skip it on the first pass. When we process F1, its target F3 is
+ // available. However, we want to move F0/F1 all at once, so we skip it here.
+ // We process F2, which frees up F2. The next pass through, we process F0 and
+ // F2/F3 are empty, so we move it. Note that if half of a double is involved
+ // in a circularity with a single, then we will have already moved that half
+ // above, so we go ahead and move the remaining half as a single.
+ // Because there are no circularities left, we are guaranteed to terminate.
+
+ assert(argNum > 0);
+ assert(regArgTab[argNum - 1].slot == 1);
+
+ if (!regArgTab[argNum - 1].processed)
+ {
+ // The first half of the double hasn't been processed yet; wait so that
+ // both halves can be processed at the same time
+ continue;
+ }
+
+ // The first half of the double has been processed but the second half hasn't!
+ // This could happen for double F2/F3 moving to F0/F1, and single F0 moving to F2.
+ // In that case, there is a F0/F2 loop that is not a double-only loop. The circular
+ // dependency logic above will move them as singles, leaving just F3 to move. Treat
+ // it as a single to finish the shuffling.
+
+ destMemType = TYP_FLOAT;
+ destRegNum = REG_NEXT(varDsc->lvRegNum);
+ }
+#endif // !_TARGET_64BIT_
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) && defined(FEATURE_SIMD)
+ else
+ {
+ assert(regArgTab[argNum].slot == 2);
+ assert(argNum > 0);
+ assert(regArgTab[argNum - 1].slot == 1);
+ assert((varDsc->lvType == TYP_SIMD12) || (varDsc->lvType == TYP_SIMD16));
+ destRegNum = varDsc->lvRegNum;
+ noway_assert(regNum != destRegNum);
+ continue;
+ }
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) && defined(FEATURE_SIMD)
+ noway_assert(destRegNum != REG_NA);
+ if (destRegNum != regNum)
+ {
+ /* Cannot trash a currently live register argument.
+ * Skip this one until its target is free,
+ * which is guaranteed to happen since we have no circular dependencies. */
+
+ regMaskTP destMask = genRegMask(destRegNum);
+#ifdef _TARGET_ARM_
+ // Don't process the double until both halves of the destination are clear.
+ if (genActualType(destMemType) == TYP_DOUBLE) + { + assert((destMask & RBM_DBL_REGS) != 0); + destMask |= genRegMask(REG_NEXT(destRegNum)); + } +#endif + + if (destMask & regArgMaskLive) + { + continue; + } + + /* Move it to the new register */ + + emitAttr size = emitActualTypeSize(destMemType); + + getEmitter()->emitIns_R_R(ins_Copy(destMemType), size, destRegNum, regNum); + + psiMoveToReg(varNum); + } + + /* mark the argument as processed */ + + assert(!regArgTab[argNum].processed); + regArgTab[argNum].processed = true; + regArgMaskLive &= ~genRegMask(regNum); +#if FEATURE_MULTIREG_ARGS + int argRegCount = 1; +#ifdef _TARGET_ARM_ + if (genActualType(destMemType) == TYP_DOUBLE) + { + argRegCount = 2; + } +#endif +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) && defined(FEATURE_SIMD) + if (varTypeIsStruct(varDsc) && argNum < (argMax - 1) && regArgTab[argNum + 1].slot == 2) + { + argRegCount = 2; + int nextArgNum = argNum + 1; + regNumber nextRegNum = genMapRegArgNumToRegNum(nextArgNum, regArgTab[nextArgNum].getRegType(compiler)); + noway_assert(regArgTab[nextArgNum].varNum == varNum); + // Emit a shufpd with a 0 immediate, which preserves the 0th element of the dest reg + // and moves the 0th element of the src reg into the 1st element of the dest reg. + getEmitter()->emitIns_R_R_I(INS_shufpd, emitActualTypeSize(varDsc->lvType), destRegNum, nextRegNum, 0); + // Set destRegNum to regNum so that we skip the setting of the register below, + // but mark argNum as processed and clear regNum from the live mask. + destRegNum = regNum; + } +#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) && defined(FEATURE_SIMD) + // Mark the rest of the argument registers corresponding to this multi-reg type as + // being processed and no longer live. + for (int regSlot = 1; regSlot < argRegCount; regSlot++) + { + int nextArgNum = argNum + regSlot; + assert(!regArgTab[nextArgNum].processed); + regArgTab[nextArgNum].processed = true; + regNumber nextRegNum = genMapRegArgNumToRegNum(nextArgNum, regArgTab[nextArgNum].getRegType(compiler)); + regArgMaskLive &= ~genRegMask(nextRegNum); + } +#endif // FEATURE_MULTIREG_ARGS + } + + noway_assert(regArgMaskLiveSave != regArgMaskLive); // if it doesn't change, we have an infinite loop + } +} +#ifdef _PREFAST_ +#pragma warning(pop) +#endif + +/***************************************************************************** + * If any incoming stack arguments live in registers, load them. + */ +void CodeGen::genEnregisterIncomingStackArgs() +{ +#ifdef DEBUG + if (verbose) + { + printf("*************** In genEnregisterIncomingStackArgs()\n"); + } +#endif + + assert(compiler->compGeneratingProlog); + + unsigned varNum = 0; + + for (LclVarDsc *varDsc = compiler->lvaTable; varNum < compiler->lvaCount; varNum++, varDsc++) + { + /* Is this variable a parameter? */ + + if (!varDsc->lvIsParam) + { + continue; + } + + /* If it's a register argument then it's already been taken care of. + But, on Arm when under a profiler, we would have prespilled a register argument + and hence here we need to load it from its prespilled location. + */ + bool isPrespilledForProfiling = false; +#if defined(_TARGET_ARM_) && defined(PROFILING_SUPPORTED) + isPrespilledForProfiling = + compiler->compIsProfilerHookNeeded() && compiler->lvaIsPreSpilled(varNum, regSet.rsMaskPreSpillRegs(false)); +#endif + + if (varDsc->lvIsRegArg && !isPrespilledForProfiling) + { + continue; + } + + /* Has the parameter been assigned to a register? 
*/
+
+ if (!varDsc->lvIsInReg())
+ {
+ continue;
+ }
+
+ var_types type = genActualType(varDsc->TypeGet());
+
+#if FEATURE_STACK_FP_X87
+ // Floating point locals are loaded onto the x86-FPU in the next section
+ if (varTypeIsFloating(type))
+ continue;
+#endif
+
+ /* Is the variable dead on entry? */
+
+ if (!VarSetOps::IsMember(compiler, compiler->fgFirstBB->bbLiveIn, varDsc->lvVarIndex))
+ {
+ continue;
+ }
+
+ /* Load the incoming parameter into the register */
+
+ /* Figure out the home offset of the incoming argument */
+
+ regNumber regNum;
+ regNumber otherReg;
+
+#ifndef LEGACY_BACKEND
+#ifdef _TARGET_ARM_
+ if (type == TYP_LONG)
+ {
+ regPairNo regPair = varDsc->lvArgInitRegPair;
+ regNum = genRegPairLo(regPair);
+ otherReg = genRegPairHi(regPair);
+ }
+ else
+#endif // _TARGET_ARM_
+ {
+ regNum = varDsc->lvArgInitReg;
+ otherReg = REG_NA;
+ }
+#else // LEGACY_BACKEND
+ regNum = varDsc->lvRegNum;
+ if (type == TYP_LONG)
+ {
+ otherReg = varDsc->lvOtherReg;
+ }
+ else
+ {
+ otherReg = REG_NA;
+ }
+#endif // LEGACY_BACKEND
+
+ assert(regNum != REG_STK);
+
+#ifndef _TARGET_64BIT_
+ if (type == TYP_LONG)
+ {
+ /* long - at least the low half must be enregistered */
+
+ getEmitter()->emitIns_R_S(ins_Load(TYP_INT), EA_4BYTE, regNum, varNum, 0);
+ regTracker.rsTrackRegTrash(regNum);
+
+ /* Is the upper half also enregistered? */
+
+ if (otherReg != REG_STK)
+ {
+ getEmitter()->emitIns_R_S(ins_Load(TYP_INT), EA_4BYTE, otherReg, varNum, sizeof(int));
+ regTracker.rsTrackRegTrash(otherReg);
+ }
+ }
+ else
+#endif // !_TARGET_64BIT_
+ {
+ /* Loading a single register - this is the easy/common case */
+
+ getEmitter()->emitIns_R_S(ins_Load(type), emitTypeSize(type), regNum, varNum, 0);
+ regTracker.rsTrackRegTrash(regNum);
+ }
+
+ psiMoveToReg(varNum);
+ }
+}
+
+/*-------------------------------------------------------------------------
+ *
+ * We have to decide whether we're going to use block initialization
+ * in the prolog before we assign final stack offsets. This is because
+ * when using block initialization we may need additional callee-saved
+ * registers which need to be saved on the frame, thus increasing the
+ * frame size.
+ *
+ * We'll count the number of locals we have to initialize,
+ * and if there are lots of them we'll use block initialization.
+ * Thus, the local variable table must have accurate register location
+ * information for enregistered locals, reflecting their register state
+ * on entry to the function.
+ *
+ * At the same time we set lvMustInit for locals (enregistered or on stack)
+ * that must be initialized (e.g. when memory must be initialized
+ * (compInitMem), for untracked pointers, or when DFA is disabled)
+ */
+void CodeGen::genCheckUseBlockInit()
+{
+#ifndef LEGACY_BACKEND // this is called before codegen in RyuJIT backend
+ assert(!compiler->compGeneratingProlog);
+#else // LEGACY_BACKEND
+ assert(compiler->compGeneratingProlog);
+#endif // LEGACY_BACKEND
+
+ unsigned initStkLclCnt = 0; // The number of int-sized stack local variables that need to be initialized (variables
+ // larger than int count for more than 1).
+ unsigned largeGcStructs = 0; // The number of "large" structs with GC pointers. Used as part of the heuristic to
+ // determine whether to use block init.
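+
+ // For example (illustrative numbers only): a method with six must-init int-sized
+ // stack slots and no large GC structs ends up with genInitStkLclCnt = 6 and
+ // largeGcStructs = 0, so the threshold test near the end of this method,
+ //
+ // genUseBlockInit = (genInitStkLclCnt > (largeGcStructs + 4));
+ //
+ // comes out true (6 > 4) and the prolog zeroes the frame with a single block
+ // operation (e.g. "rep stos" on xarch) rather than storing zeros slot by slot.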
+
+ unsigned varNum;
+ LclVarDsc* varDsc;
+
+ for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->lvaCount; varNum++, varDsc++)
+ {
+ if (varDsc->lvIsParam)
+ {
+ continue;
+ }
+
+ if (!varDsc->lvIsInReg() && !varDsc->lvOnFrame)
+ {
+ noway_assert(varDsc->lvRefCnt == 0);
+ continue;
+ }
+
+ if (varNum == compiler->lvaInlinedPInvokeFrameVar || varNum == compiler->lvaStubArgumentVar)
+ {
+ continue;
+ }
+
+#if FEATURE_FIXED_OUT_ARGS
+ if (varNum == compiler->lvaPInvokeFrameRegSaveVar)
+ {
+ continue;
+ }
+ if (varNum == compiler->lvaOutgoingArgSpaceVar)
+ {
+ continue;
+ }
+#endif
+
+#if FEATURE_EH_FUNCLETS
+ // There's no need to force 0-initialization of the PSPSym, it will be
+ // initialized with a real value in the prolog
+ if (varNum == compiler->lvaPSPSym)
+ {
+ continue;
+ }
+#endif
+
+ if (compiler->lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+ {
+ // For Compiler::PROMOTION_TYPE_DEPENDENT type of promotion, the whole struct should have been
+ // initialized by the parent struct. No need to set the lvMustInit bit in the
+ // field locals.
+ continue;
+ }
+
+ if (compiler->info.compInitMem || varTypeIsGC(varDsc->TypeGet()) || (varDsc->lvStructGcCount > 0) ||
+ varDsc->lvMustInit)
+ {
+ if (varDsc->lvTracked)
+ {
+ /* For uninitialized use of tracked variables, the liveness
+ * will bubble to the top (compiler->fgFirstBB) in fgInterBlockLocalVarLiveness()
+ */
+ if (varDsc->lvMustInit ||
+ VarSetOps::IsMember(compiler, compiler->fgFirstBB->bbLiveIn, varDsc->lvVarIndex))
+ {
+ /* This var must be initialized */
+
+ varDsc->lvMustInit = 1;
+
+ /* If the variable is on the stack it will be initialized
+ * using rep stos - compute the total size to be zero-ed */
+
+ if (varDsc->lvOnFrame)
+ {
+ if (!varDsc->lvRegister)
+ {
+#ifndef LEGACY_BACKEND
+ if (!varDsc->lvIsInReg())
+#endif // !LEGACY_BACKEND
+ {
+ // Var is completely on the stack, in the legacy JIT case, or
+ // on the stack at entry, in the RyuJIT case.
+ initStkLclCnt += (unsigned)roundUp(compiler->lvaLclSize(varNum)) / sizeof(int);
+ }
+ }
+ else
+ {
+ // Var is partially enregistered
+ noway_assert(genTypeSize(varDsc->TypeGet()) > sizeof(int) && varDsc->lvOtherReg == REG_STK);
+ initStkLclCnt += genTypeStSz(TYP_INT);
+ }
+ }
+ }
+ }
+
+ /* With compInitMem, all untracked vars will have to be init'ed */
+ /* VSW 102460 - Do not force initialization of compiler generated temps,
+ unless they are untracked GC type or structs that contain GC pointers */
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#if FEATURE_SIMD
+ // TODO-1stClassStructs
+ // This is here to duplicate previous behavior, where TYP_SIMD8 locals
+ // were not being re-typed correctly.
+ if ((!varDsc->lvTracked || (varDsc->lvType == TYP_STRUCT) || (varDsc->lvType == TYP_SIMD8)) &&
+#else // !FEATURE_SIMD
+ if ((!varDsc->lvTracked || (varDsc->lvType == TYP_STRUCT)) &&
+#endif // !FEATURE_SIMD
+ varDsc->lvOnFrame &&
+ (!varDsc->lvIsTemp || varTypeIsGC(varDsc->TypeGet()) || (varDsc->lvStructGcCount > 0)))
+ {
+ varDsc->lvMustInit = true;
+
+ initStkLclCnt += (unsigned)roundUp(compiler->lvaLclSize(varNum)) / sizeof(int);
+ }
+
+ continue;
+ }
+
+ /* Ignore if not a pointer variable or value class with a GC field */
+
+ if (!compiler->lvaTypeIsGC(varNum))
+ {
+ continue;
+ }
+
+#if CAN_DISABLE_DFA
+ /* If we don't know lifetimes of variables, must be conservative */
+
+ if (compiler->opts.MinOpts())
+ {
+ varDsc->lvMustInit = true;
+ noway_assert(!varDsc->lvRegister);
+ }
+ else
+#endif // CAN_DISABLE_DFA
+ {
+ if (!varDsc->lvTracked)
+ {
+ varDsc->lvMustInit = true;
+ }
+ }
+
+ /* Is this a 'must-init' stack pointer local? */
+
+ if (varDsc->lvMustInit && varDsc->lvOnFrame)
+ {
+ initStkLclCnt += varDsc->lvStructGcCount;
+ }
+
+ if ((compiler->lvaLclSize(varNum) > (3 * sizeof(void*))) && (largeGcStructs <= 4))
+ {
+ largeGcStructs++;
+ }
+ }
+
+ /* Don't forget about spill temps that hold pointers */
+
+ if (!TRACK_GC_TEMP_LIFETIMES)
+ {
+ assert(compiler->tmpAllFree());
+ for (TempDsc* tempThis = compiler->tmpListBeg(); tempThis != nullptr; tempThis = compiler->tmpListNxt(tempThis))
+ {
+ if (varTypeIsGC(tempThis->tdTempType()))
+ {
+ initStkLclCnt++;
+ }
+ }
+ }
+
+ // After debugging this further it was found that this logic is incorrect:
+ // it incorrectly assumes the stack slots are always 4 bytes (not necessarily the case)
+ // and this also double counts variables (we saw this in the debugger) around line 4829.
+ // Even though this doesn't pose a problem with correctness it will improperly decide to
+ // zero init the stack using a block operation instead of a 'case by case' basis.
+ genInitStkLclCnt = initStkLclCnt;
+
+ /* If we have more than 4 untracked locals, use block initialization */
+ /* TODO-Review: If we have large structs, bias toward not using block initialization since
+ we waste all the other slots. Really need to compute the correct cost
+ and compare that against zeroing the slots individually */
+
+ genUseBlockInit = (genInitStkLclCnt > (largeGcStructs + 4));
+
+ if (genUseBlockInit)
+ {
+ regMaskTP maskCalleeRegArgMask = intRegState.rsCalleeRegArgMaskLiveIn;
+
+ // If there is a secret stub param, don't count it, as it will no longer
+ // be live when we do block init.
+ if (compiler->info.compPublishStubParam)
+ {
+ maskCalleeRegArgMask &= ~RBM_SECRET_STUB_PARAM;
+ }
+
+#ifdef _TARGET_XARCH_
+ // If we're going to use "REP STOS", remember that we will trash EDI.
+ // For fastcall we will have to save ECX and EAX, so reserve two extra
+ // callee-saved registers. This is better than pushing eax and ecx,
+ // because in the latter case we would mess up already-computed offsets
+ // on the stack (for ESP frames).
+ regSet.rsSetRegsModified(RBM_EDI);
+
+#ifdef UNIX_AMD64_ABI
+ // For register arguments we may have to save RCX (and RDI on AMD64 System V OSes);
+ // in such cases use the R12 and R13 registers.
+ if (maskCalleeRegArgMask & RBM_RCX)
+ {
+ regSet.rsSetRegsModified(RBM_R12);
+ }
+
+ if (maskCalleeRegArgMask & RBM_RDI)
+ {
+ regSet.rsSetRegsModified(RBM_R13);
+ }
+#else // !UNIX_AMD64_ABI
+ if (maskCalleeRegArgMask & RBM_ECX)
+ {
+ regSet.rsSetRegsModified(RBM_ESI);
+ }
+#endif // !UNIX_AMD64_ABI
+
+ if (maskCalleeRegArgMask & RBM_EAX)
+ {
+ regSet.rsSetRegsModified(RBM_EBX);
+ }
+
+#endif // _TARGET_XARCH_
+#ifdef _TARGET_ARM_
+ //
+ // On ARM, if we are using block init, then we must force-spill R4/R5/R6
+ // so that we can use them during the zero-initialization process.
+ //
+ int forceSpillRegCount = genCountBits(maskCalleeRegArgMask & ~regSet.rsMaskPreSpillRegs(false)) - 1;
+ if (forceSpillRegCount > 0)
+ regSet.rsSetRegsModified(RBM_R4);
+ if (forceSpillRegCount > 1)
+ regSet.rsSetRegsModified(RBM_R5);
+ if (forceSpillRegCount > 2)
+ regSet.rsSetRegsModified(RBM_R6);
+#endif // _TARGET_ARM_
+ }
+}
+
+/*-----------------------------------------------------------------------------
+ *
+ * Push any callee-saved registers we have used
+ */
+
+#if defined(_TARGET_ARM64_)
+void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroed)
+#else
+void CodeGen::genPushCalleeSavedRegisters()
+#endif
+{
+ assert(compiler->compGeneratingProlog);
+
+#if defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87
+ // x86/x64 doesn't support pushing xmm/ymm regs, therefore consider only integer registers for pushing onto the
+ // stack here. Space for preserved float registers is allocated on the stack and they are saved as part of the
+ // prolog sequence, not here.
+ regMaskTP rsPushRegs = regSet.rsGetModifiedRegsMask() & RBM_INT_CALLEE_SAVED;
+#else // !defined(_TARGET_XARCH_) || FEATURE_STACK_FP_X87
+ regMaskTP rsPushRegs = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED;
+#endif
+
+#if ETW_EBP_FRAMED
+ if (!isFramePointerUsed() && regSet.rsRegsModified(RBM_FPBASE))
+ {
+ noway_assert(!"Used register RBM_FPBASE as a scratch register!");
+ }
+#endif
+
+#ifdef _TARGET_XARCH_
+ // On X86/X64 we have already pushed the FP (frame-pointer) prior to calling this method
+ if (isFramePointerUsed())
+ {
+ rsPushRegs &= ~RBM_FPBASE;
+ }
+#endif
+
+#ifdef _TARGET_ARMARCH_
+ // On ARM we push the FP (frame-pointer) here along with all other callee saved registers
+ if (isFramePointerUsed())
+ rsPushRegs |= RBM_FPBASE;
+
+ //
+ // It may be possible to skip pushing/popping lr for leaf methods. However, such an optimization would require
+ // changes in GC suspension architecture.
+ //
+ // We would need to guarantee that a tight loop calling a virtual leaf method can be suspended for GC. Today, we
+ // generate partially interruptible code for both the method that contains the tight loop with the call and the leaf
+ // method. GC suspension depends on return address hijacking in this case. Return address hijacking depends
+ // on the return address to be saved on the stack. If we skipped pushing/popping lr, the return address would never
+ // be saved on the stack and the GC suspension would time out.
+ //
+ // So if we wanted to skip pushing/popping lr for leaf frames, we would also need to do one of
+ // the following to make GC suspension work in the above scenario:
+ // - Make return address hijacking work even when lr is not saved on the stack.
+ // - Generate fully interruptible code for loops that contain calls
+ // - Generate fully interruptible code for leaf methods
+ //
+ // Given the limited benefit from this optimization (<10k for mscorlib NGen image), the extra complexity
+ // is not worth it.
+ //
+ rsPushRegs |= RBM_LR; // We must save the return address (in the LR register)
+
+ regSet.rsMaskCalleeSaved = rsPushRegs;
+#endif // _TARGET_ARMARCH_
+
+#ifdef DEBUG
+ if (compiler->compCalleeRegsPushed != genCountBits(rsPushRegs))
+ {
+ printf("Error: unexpected number of callee-saved registers to push. Expected: %d. Got: %d ",
+ compiler->compCalleeRegsPushed, genCountBits(rsPushRegs));
+ dspRegMask(rsPushRegs);
+ printf("\n");
+ assert(compiler->compCalleeRegsPushed == genCountBits(rsPushRegs));
+ }
+#endif // DEBUG
+
+#if defined(_TARGET_ARM_)
+ regMaskTP maskPushRegsFloat = rsPushRegs & RBM_ALLFLOAT;
+ regMaskTP maskPushRegsInt = rsPushRegs & ~maskPushRegsFloat;
+
+ maskPushRegsInt |= genStackAllocRegisterMask(compiler->compLclFrameSize, maskPushRegsFloat);
+
+ assert(FitsIn<int>(maskPushRegsInt));
+ inst_IV(INS_push, (int)maskPushRegsInt);
+ compiler->unwindPushMaskInt(maskPushRegsInt);
+
+ if (maskPushRegsFloat != 0)
+ {
+ genPushFltRegs(maskPushRegsFloat);
+ compiler->unwindPushMaskFloat(maskPushRegsFloat);
+ }
+#elif defined(_TARGET_ARM64_)
+ // See the document "ARM64 JIT Frame Layout" and/or "ARM64 Exception Data" for more details on requirements and
+ // options. Case numbers in comments here refer to this document.
+ //
+ // For most frames, generate, e.g.:
+ // stp fp, lr, [sp,-0x80]! // predecrement SP with full frame size, and store FP/LR pair. Storing a pair
+ // // ensures the stack stays aligned.
+ // stp r19, r20, [sp, 0x60] // store at positive offset from SP established above, into callee-saved area
+ // // at top of frame (highest addresses).
+ // stp r21, r22, [sp, 0x70]
+ //
+ // Notes:
+ // 1. We don't always need to save FP. If FP isn't saved, then LR is saved with the other callee-saved registers
+ // at the top of the frame.
+ // 2. If we save FP, then the first store is FP, LR.
+ // 3. General-purpose registers are 8 bytes, floating-point registers are 16 bytes, but FP/SIMD registers only
+ // preserve their lower 8 bytes, by calling convention.
+ // 4. For frames with varargs, we spill the integer register arguments to the stack, so all the arguments are
+ // consecutive.
+ // 5. We allocate the frame here; no further changes to SP are allowed (except in the body, for localloc).
+
+ int totalFrameSize = genTotalFrameSize();
+
+ int offset; // This will be the starting place for saving the callee-saved registers, in increasing order.
+
+ regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_ALLFLOAT;
+ regMaskTP maskSaveRegsInt = rsPushRegs & ~maskSaveRegsFloat;
+
+ if (compiler->info.compIsVarArgs)
+ {
+ assert(maskSaveRegsFloat == RBM_NONE);
+ }
+
+ int frameType = 0; // This number is arbitrary, is defined below, and corresponds to one of the frame styles we
+ // generate based on various sizes.
+ int calleeSaveSPDelta = 0;
+ int calleeSaveSPDeltaUnaligned = 0;
+
+ if (isFramePointerUsed())
+ {
+ // We need to save both FP and LR.
+
+ assert((maskSaveRegsInt & RBM_FP) != 0);
+ assert((maskSaveRegsInt & RBM_LR) != 0);
+
+ if ((compiler->lvaOutgoingArgSpaceSize == 0) && (totalFrameSize < 512))
+ {
+ // Case #1.
+ //
+ // Generate:
+ // stp fp,lr,[sp,#-framesz]!
+ //
+ // The (totalFrameSize < 512) condition ensures that both the predecrement
+ // and the postincrement of SP can occur with STP.
+ //
+ // After saving callee-saved registers, we establish the frame pointer with:
+ // mov fp,sp
+ // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match.
+
+ frameType = 1;
+
+ getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, -totalFrameSize,
+ INS_OPTS_PRE_INDEX);
+ compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, -totalFrameSize);
+
+ maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR
+ offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR
+ }
+ else if (totalFrameSize <= 512)
+ {
+ // Case #2.
+ //
+ // Generate:
+ // sub sp,sp,#framesz
+ // stp fp,lr,[sp,#outsz] // note that by necessity, #outsz <= #framesz - 16, so #outsz <= 496.
+ //
+ // The (totalFrameSize <= 512) condition ensures the callee-saved registers can all be saved using STP with
+ // signed offset encoding.
+ //
+ // After saving callee-saved registers, we establish the frame pointer with:
+ // add fp,sp,#outsz
+ // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match.
+
+ frameType = 2;
+
+ assert(compiler->lvaOutgoingArgSpaceSize + 2 * REGSIZE_BYTES <= (unsigned)totalFrameSize);
+
+ getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, totalFrameSize);
+ compiler->unwindAllocStack(totalFrameSize);
+
+ getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE,
+ compiler->lvaOutgoingArgSpaceSize);
+ compiler->unwindSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize);
+
+ maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR
+ offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR
+ }
+ else
+ {
+ // Case 5 or 6.
+ //
+ // First, the callee-saved registers will be saved, and the callee-saved register code must use pre-index
+ // to subtract from SP as the first instruction. It must also leave space for varargs registers to be
+ // stored. For example:
+ // stp r19,r20,[sp,#-96]!
+ // stp d8,d9,[sp,#16]
+ // ... save varargs incoming integer registers ...
+ // Note that all SP alterations must be 16-byte aligned. We have already calculated any alignment to be
+ // lower on the stack than the callee-saved registers (see lvaAlignFrame() for how we calculate alignment).
+ // So, if there is an odd number of callee-saved registers, we use (for example, with just one saved
+ // register):
+ // sub sp,sp,#16
+ // str r19,[sp,#8]
+ // This is one additional instruction, but it centralizes the aligned space. Otherwise, it might be
+ // possible to have two 8-byte alignment padding words, one below the callee-saved registers, and one
+ // above them. If that is preferable, we could implement it.
+ // Note that any varargs saved space will always be 16-byte aligned, since there are 8 argument registers.
+ //
+ // Then, define #remainingFrameSz = #framesz - (callee-saved size + varargs space + possible alignment
+ // padding from above).
+ // Note that #remainingFrameSz must not be zero, since we still need to save FP,LR.
+ //
+ // Generate:
+ // sub sp,sp,#remainingFrameSz
+ // or, for large frames:
+ // mov rX, #remainingFrameSz // maybe multiple instructions
+ // sub sp,sp,rX
+ //
+ // followed by:
+ // stp fp,lr,[sp,#outsz]
+ // add fp,sp,#outsz
+ //
+ // However, we need to handle the case where #outsz is larger than the constant signed offset encoding can
+ // handle. And, once again, we might need to deal with #outsz that is not aligned to 16-bytes (i.e.,
+ // STACK_ALIGN).
So, in the case of large #outsz we will have an additional SP adjustment, using one of the + // following sequences: + // + // Define #remainingFrameSz2 = #remainingFrameSz - #outsz. + // + // sub sp,sp,#remainingFrameSz2 // if #remainingFrameSz2 is 16-byte aligned + // stp fp,lr,[sp] + // mov fp,sp + // sub sp,sp,#outsz // in this case, #outsz must also be 16-byte aligned + // + // Or: + // + // sub sp,sp,roundUp(#remainingFrameSz2,16) // if #remainingFrameSz2 is not 16-byte aligned (it is + // // always guaranteed to be 8 byte aligned). + // stp fp,lr,[sp,#8] // it will always be #8 in the unaligned case + // add fp,sp,#8 + // sub sp,sp,#outsz - #8 + // + // (As usual, for a large constant "#outsz - #8", we might need multiple instructions: + // mov rX, #outsz - #8 // maybe multiple instructions + // sub sp,sp,rX + // ) + + frameType = 3; + + calleeSaveSPDeltaUnaligned = + totalFrameSize - compiler->compLclFrameSize - 2 * REGSIZE_BYTES; // 2 for FP, LR which we'll save later. + assert(calleeSaveSPDeltaUnaligned >= 0); + assert((calleeSaveSPDeltaUnaligned % 8) == 0); // It better at least be 8 byte aligned. + calleeSaveSPDelta = AlignUp((UINT)calleeSaveSPDeltaUnaligned, STACK_ALIGN); + + offset = calleeSaveSPDelta - calleeSaveSPDeltaUnaligned; + assert((offset == 0) || (offset == REGSIZE_BYTES)); // At most one alignment slot between SP and where we + // store the callee-saved registers. + + // We'll take care of these later, but callee-saved regs code shouldn't see them. + maskSaveRegsInt &= ~(RBM_FP | RBM_LR); + } + } + else + { + // No frame pointer (no chaining). + assert((maskSaveRegsInt & RBM_FP) == 0); + assert((maskSaveRegsInt & RBM_LR) != 0); + + // Note that there is no pre-indexed save_lrpair unwind code variant, so we can't allocate the frame using 'stp' + // if we only have one callee-saved register plus LR to save. + + NYI("Frame without frame pointer"); + offset = 0; + } + + assert(frameType != 0); + + genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, offset, -calleeSaveSPDelta); + + offset += genCountBits(maskSaveRegsInt | maskSaveRegsFloat) * REGSIZE_BYTES; + + // For varargs, home the incoming arg registers last. Note that there is nothing to unwind here, + // so we just report "NOP" unwind codes. If there's no more frame setup after this, we don't + // need to add codes at all. + + if (compiler->info.compIsVarArgs) + { + // There are 8 general-purpose registers to home, thus 'offset' must be 16-byte aligned here. + assert((offset % 16) == 0); + for (regNumber reg1 = REG_ARG_FIRST; reg1 < REG_ARG_LAST; reg1 = REG_NEXT(REG_NEXT(reg1))) + { + regNumber reg2 = REG_NEXT(reg1); + // stp REG, REG + 1, [SP, #offset] + getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, offset); + compiler->unwindNop(); + offset += 2 * REGSIZE_BYTES; + } + } + + if (frameType == 1) + { + getEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_FPBASE, REG_SPBASE); + compiler->unwindSetFrameReg(REG_FPBASE, 0); + } + else if (frameType == 2) + { + getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_FPBASE, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize); + compiler->unwindSetFrameReg(REG_FPBASE, compiler->lvaOutgoingArgSpaceSize); + } + else if (frameType == 3) + { + int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; + assert(remainingFrameSz > 0); + assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component -- + // totalFrameSize and calleeSaveSPDelta -- is 16-byte aligned. 
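+
+ // For example, assuming totalFrameSize = 4096, calleeSaveSPDelta = 96 and
+ // lvaOutgoingArgSpaceSize = 1000 (illustrative numbers only, not taken from real output):
+ // remainingFrameSz = 4096 - 96 = 4000
+ // spAdjustment2 = roundUp(4000 - 1000, 16) = 3008
+ // alignmentAdjustment2 = 3008 - 3000 = 8
+ // spAdjustment3 = 1000 - 8 = 992
+ // so the large-#outsz path below would emit:
+ // sub sp,sp,#3008
+ // stp fp,lr,[sp,#8]
+ // add fp,sp,#8
+ // sub sp,sp,#992
+ // for a total SP decrement of 96 + 3008 + 992 = 4096 = totalFrameSize.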
+ + if (compiler->lvaOutgoingArgSpaceSize >= 504) + { + // We can't do "stp fp,lr,[sp,#outsz]" because #outsz is too big. + // If compiler->lvaOutgoingArgSpaceSize is not aligned, we need to align the SP adjustment. + assert(remainingFrameSz > (int)compiler->lvaOutgoingArgSpaceSize); + int spAdjustment2Unaligned = remainingFrameSz - compiler->lvaOutgoingArgSpaceSize; + int spAdjustment2 = (int)roundUp((size_t)spAdjustment2Unaligned, STACK_ALIGN); + int alignmentAdjustment2 = spAdjustment2 - spAdjustment2Unaligned; + assert((alignmentAdjustment2 == 0) || (alignmentAdjustment2 == 8)); + + genPrologSaveRegPair(REG_FP, REG_LR, alignmentAdjustment2, -spAdjustment2, false, initReg, pInitRegZeroed); + offset += spAdjustment2; + + // Now subtract off the #outsz (or the rest of the #outsz if it was unaligned, and the above "sub" included + // some of it) + + int spAdjustment3 = compiler->lvaOutgoingArgSpaceSize - alignmentAdjustment2; + assert(spAdjustment3 > 0); + assert((spAdjustment3 % 16) == 0); + + getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_FPBASE, REG_SPBASE, alignmentAdjustment2); + compiler->unwindSetFrameReg(REG_FPBASE, alignmentAdjustment2); + + genStackPointerAdjustment(-spAdjustment3, initReg, pInitRegZeroed); + offset += spAdjustment3; + } + else + { + genPrologSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize, -remainingFrameSz, false, initReg, + pInitRegZeroed); + offset += remainingFrameSz; + + getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_FPBASE, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize); + compiler->unwindSetFrameReg(REG_FPBASE, compiler->lvaOutgoingArgSpaceSize); + } + } + + assert(offset == totalFrameSize); + +#elif defined(_TARGET_XARCH_) + // Push backwards so we match the order we will pop them in the epilog + // and all the other code that expects it to be in this order. + for (regNumber reg = REG_INT_LAST; rsPushRegs != RBM_NONE; reg = REG_PREV(reg)) + { + regMaskTP regBit = genRegMask(reg); + + if ((regBit & rsPushRegs) != 0) + { + inst_RV(INS_push, reg, TYP_REF); + compiler->unwindPush(reg); + + if (!doubleAlignOrFramePointerUsed()) + { + psiAdjustStackLevel(REGSIZE_BYTES); + } + + rsPushRegs &= ~regBit; + } + } + +#else + assert(!"Unknown TARGET"); +#endif // _TARGET_* +} + +/*----------------------------------------------------------------------------- + * + * Probe the stack and allocate the local stack frame: subtract from SP. + * On ARM64, this only does the probing; allocating the frame is done when callee-saved registers are saved. + */ + +void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn) +{ + assert(compiler->compGeneratingProlog); + + if (frameSize == 0) + { + return; + } + + const size_t pageSize = compiler->eeGetPageSize(); + +#ifdef _TARGET_ARM_ + assert(!compiler->info.compPublishStubParam || (REG_SECRET_STUB_PARAM != initReg)); +#endif // _TARGET_ARM_ + +#ifdef _TARGET_XARCH_ + if (frameSize == REGSIZE_BYTES) + { + // Frame size is the same as register size. 
+ inst_RV(INS_push, REG_EAX, TYP_I_IMPL);
+ }
+ else
+#endif // _TARGET_XARCH_
+ if (frameSize < pageSize)
+ {
+#ifndef _TARGET_ARM64_
+ // Frame size is (0x0008..0x1000)
+ inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
+#endif // !_TARGET_ARM64_
+ }
+ else if (frameSize < compiler->getVeryLargeFrameSize())
+ {
+ // Frame size is (0x1000..0x3000)
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#if CPU_LOAD_STORE_ARCH
+ instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)pageSize);
+ getEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, initReg, REG_SPBASE, initReg);
+ regTracker.rsTrackRegTrash(initReg);
+ *pInitRegZeroed = false; // The initReg does not contain zero
+#else
+ getEmitter()->emitIns_AR_R(INS_TEST, EA_PTRSIZE, REG_EAX, REG_SPBASE, -(int)pageSize);
+#endif
+
+ if (frameSize >= 0x2000)
+ {
+#if CPU_LOAD_STORE_ARCH
+ instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -2 * (ssize_t)pageSize);
+ getEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, initReg, REG_SPBASE, initReg);
+ regTracker.rsTrackRegTrash(initReg);
+#else
+ getEmitter()->emitIns_AR_R(INS_TEST, EA_PTRSIZE, REG_EAX, REG_SPBASE, -2 * (int)pageSize);
+#endif
+ }
+
+#ifdef _TARGET_ARM64_
+ compiler->unwindPadding();
+#else // !_TARGET_ARM64_
+#if CPU_LOAD_STORE_ARCH
+ instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, frameSize);
+ compiler->unwindPadding();
+ getEmitter()->emitIns_R_R_R(INS_sub, EA_4BYTE, REG_SPBASE, REG_SPBASE, initReg);
+#else
+ inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
+#endif
+#endif // !_TARGET_ARM64_
+ }
+ else
+ {
+ // Frame size >= 0x3000
+ assert(frameSize >= compiler->getVeryLargeFrameSize());
+
+ // Emit the following sequence to 'tickle' the pages.
+ // Note it is important that the stack pointer not change until this is
+ // complete since the tickles could cause a stack overflow, and we
+ // need to be able to crawl the stack afterward (which means the
+ // stack pointer needs to be known).
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#ifdef _TARGET_XARCH_
+ bool pushedStubParam = false;
+ if (compiler->info.compPublishStubParam && (REG_SECRET_STUB_PARAM == initReg))
+ {
+ // push register containing the StubParam
+ inst_RV(INS_push, REG_SECRET_STUB_PARAM, TYP_I_IMPL);
+ pushedStubParam = true;
+ }
+#endif // _TARGET_XARCH_
+
+ instGen_Set_Reg_To_Zero(EA_PTRSIZE, initReg);
+
+ //
+ // Can't have a label inside the ReJIT padding area
+ //
+ genPrologPadForReJit();
+
+#if CPU_LOAD_STORE_ARCH
+
+ // TODO-ARM64-Bug?: set the availMask properly!
+ regMaskTP availMask =
+ (regSet.rsGetModifiedRegsMask() & RBM_ALLINT) | RBM_R12 | RBM_LR; // Set of available registers
+ availMask &= ~maskArgRegsLiveIn; // Remove all of the incoming argument registers as they are currently live
+ availMask &= ~genRegMask(initReg); // Remove the pre-calculated initReg
+
+ regNumber rOffset = initReg;
+ regNumber rLimit;
+ regNumber rTemp;
+ regMaskTP tempMask;
+
+ // We pick the next lowest register number for rTemp
+ noway_assert(availMask != RBM_NONE);
+ tempMask = genFindLowestBit(availMask);
+ rTemp = genRegNumFromMask(tempMask);
+ availMask &= ~tempMask;
+
+ // We pick the next lowest register number for rLimit
+ noway_assert(availMask != RBM_NONE);
+ tempMask = genFindLowestBit(availMask);
+ rLimit = genRegNumFromMask(tempMask);
+ availMask &= ~tempMask;
+
+ // TODO-LdStArch-Bug?: review this. The first time we load from [sp+0] which will always succeed. That doesn't
+ // make sense.
+ // TODO-ARM64-CQ: we could probably use ZR on ARM64 instead of rTemp.
+ // + // mov rLimit, -frameSize + // loop: + // ldr rTemp, [sp+rOffset] + // sub rOffset, 0x1000 // Note that 0x1000 on ARM32 uses the funky Thumb immediate encoding + // cmp rOffset, rLimit + // jge loop + noway_assert((ssize_t)(int)frameSize == (ssize_t)frameSize); // make sure framesize safely fits within an int + instGen_Set_Reg_To_Imm(EA_PTRSIZE, rLimit, -(int)frameSize); + getEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, rTemp, REG_SPBASE, rOffset); + regTracker.rsTrackRegTrash(rTemp); +#if defined(_TARGET_ARM_) + getEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, rOffset, pageSize); +#elif defined(_TARGET_ARM64_) + getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, rOffset, rOffset, pageSize); +#endif // _TARGET_ARM64_ + getEmitter()->emitIns_R_R(INS_cmp, EA_PTRSIZE, rOffset, rLimit); + getEmitter()->emitIns_J(INS_bhi, NULL, -4); + +#else // !CPU_LOAD_STORE_ARCH + + // Code size for each instruction. We need this because the + // backward branch is hard-coded with the number of bytes to branch. + // The encoding differs based on the architecture and what register is + // used (namely, using RAX has a smaller encoding). + // + // loop: + // For x86 + // test [esp + eax], eax 3 + // sub eax, 0x1000 5 + // cmp EAX, -frameSize 5 + // jge loop 2 + // + // For AMD64 using RAX + // test [rsp + rax], rax 4 + // sub rax, 0x1000 6 + // cmp rax, -frameSize 6 + // jge loop 2 + // + // For AMD64 using RBP + // test [rsp + rbp], rbp 4 + // sub rbp, 0x1000 7 + // cmp rbp, -frameSize 7 + // jge loop 2 + + getEmitter()->emitIns_R_ARR(INS_TEST, EA_PTRSIZE, initReg, REG_SPBASE, initReg, 0); + inst_RV_IV(INS_sub, initReg, pageSize, EA_PTRSIZE); + inst_RV_IV(INS_cmp, initReg, -((ssize_t)frameSize), EA_PTRSIZE); + + int bytesForBackwardJump; +#ifdef _TARGET_AMD64_ + assert((initReg == REG_EAX) || (initReg == REG_EBP)); // We use RBP as initReg for EH funclets. + bytesForBackwardJump = ((initReg == REG_EAX) ? 
-18 : -20);
+#else // !_TARGET_AMD64_
+        assert(initReg == REG_EAX);
+        bytesForBackwardJump = -15;
+#endif // !_TARGET_AMD64_
+
+        inst_IV(INS_jge, bytesForBackwardJump); // Branch backwards to start of loop
+
+#endif // !CPU_LOAD_STORE_ARCH
+
+        *pInitRegZeroed = false; // The initReg does not contain zero
+
+#ifdef _TARGET_XARCH_
+        if (pushedStubParam)
+        {
+            // pop the pushed StubParam back into its register
+            inst_RV(INS_pop, REG_SECRET_STUB_PARAM, TYP_I_IMPL);
+            regTracker.rsTrackRegTrash(REG_SECRET_STUB_PARAM);
+        }
+#endif // _TARGET_XARCH_
+
+#if CPU_LOAD_STORE_ARCH
+        compiler->unwindPadding();
+#endif
+
+#if CPU_LOAD_STORE_ARCH
+#ifndef _TARGET_ARM64_
+        inst_RV_RV(INS_add, REG_SPBASE, rLimit, TYP_I_IMPL);
+#endif // !_TARGET_ARM64_
+#else
+        //      sub esp, frameSize   6
+        inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
+#endif
+    }
+
+#ifndef _TARGET_ARM64_
+    compiler->unwindAllocStack(frameSize);
+
+    if (!doubleAlignOrFramePointerUsed())
+    {
+        psiAdjustStackLevel(frameSize);
+    }
+#endif // !_TARGET_ARM64_
+}
+
+#if defined(_TARGET_ARM_)
+
+void CodeGen::genPushFltRegs(regMaskTP regMask)
+{
+    assert(regMask != 0);                        // Don't call unless we have some registers to push
+    assert((regMask & RBM_ALLFLOAT) == regMask); // Only floating point registers should be in regMask
+
+    regNumber lowReg = genRegNumFromMask(genFindLowestBit(regMask));
+    int       slots  = genCountBits(regMask);
+    // regMask should be contiguously set
+    regMaskTP tmpMask = ((regMask >> lowReg) + 1); // tmpMask should have a single bit set
+    assert((tmpMask & (tmpMask - 1)) == 0);
+    assert(lowReg == REG_F16); // Currently we expect to start at F16 in the unwind codes
+
+    // Our calling convention requires that we only use vpush for TYP_DOUBLE registers
+    noway_assert(floatRegCanHoldType(lowReg, TYP_DOUBLE));
+    noway_assert((slots % 2) == 0);
+
+    getEmitter()->emitIns_R_I(INS_vpush, EA_8BYTE, lowReg, slots / 2);
+}
+
+void CodeGen::genPopFltRegs(regMaskTP regMask)
+{
+    assert(regMask != 0);                        // Don't call unless we have some registers to pop
+    assert((regMask & RBM_ALLFLOAT) == regMask); // Only floating point registers should be in regMask
+
+    regNumber lowReg = genRegNumFromMask(genFindLowestBit(regMask));
+    int       slots  = genCountBits(regMask);
+    // regMask should be contiguously set
+    regMaskTP tmpMask = ((regMask >> lowReg) + 1); // tmpMask should have a single bit set
+    assert((tmpMask & (tmpMask - 1)) == 0);
+
+    // Our calling convention requires that we only use vpop for TYP_DOUBLE registers
+    noway_assert(floatRegCanHoldType(lowReg, TYP_DOUBLE));
+    noway_assert((slots % 2) == 0);
+
+    getEmitter()->emitIns_R_I(INS_vpop, EA_8BYTE, lowReg, slots / 2);
+}
+
+/*-----------------------------------------------------------------------------
+ *
+ * If we have a jmp call, then the argument registers cannot be used in the
+ * epilog. So return the current call's argument registers as the argument
+ * registers for the jmp call.
+ */
+regMaskTP CodeGen::genJmpCallArgMask()
+{
+    assert(compiler->compGeneratingEpilog);
+
+    regMaskTP argMask = RBM_NONE;
+    for (unsigned varNum = 0; varNum < compiler->info.compArgsCount; ++varNum)
+    {
+        const LclVarDsc& desc = compiler->lvaTable[varNum];
+        if (desc.lvIsRegArg)
+        {
+            argMask |= genRegMask(desc.lvArgReg);
+        }
+    }
+    return argMask;
+}
+
+/*-----------------------------------------------------------------------------
+ *
+ * Free the local stack frame: add to SP.
+ * If epilog unwind hasn't been started, and we generate code, we start unwind
+ * and set *pUnwindStarted = true.
+ */
+
+void CodeGen::genFreeLclFrame(unsigned frameSize, /* IN OUT */ bool* pUnwindStarted, bool jmpEpilog)
+{
+    assert(compiler->compGeneratingEpilog);
+
+    if (frameSize == 0)
+        return;
+
+    // Add 'frameSize' to SP.
+    //
+    // Unfortunately, we can't just use:
+    //
+    //      inst_RV_IV(INS_add, REG_SPBASE, frameSize, EA_PTRSIZE);
+    //
+    // because we need to generate proper unwind codes for each instruction generated,
+    // and large frame sizes might generate a temp register load which might
+    // need an unwind code. We don't want to generate a "NOP" code for this
+    // temp register load; we want the unwind codes to start after that.
+
+    if (arm_Valid_Imm_For_Instr(INS_add, frameSize, INS_FLAGS_DONT_CARE))
+    {
+        if (!*pUnwindStarted)
+        {
+            compiler->unwindBegEpilog();
+            *pUnwindStarted = true;
+        }
+
+        getEmitter()->emitIns_R_I(INS_add, EA_PTRSIZE, REG_SPBASE, frameSize, INS_FLAGS_DONT_CARE);
+    }
+    else
+    {
+        regMaskTP grabMask = RBM_INT_CALLEE_TRASH;
+        if (jmpEpilog)
+        {
+            // Do not use argument registers as scratch registers in the jmp epilog.
+            grabMask &= ~genJmpCallArgMask();
+        }
+#ifndef LEGACY_BACKEND
+        regNumber tmpReg;
+        tmpReg = REG_TMP_0;
+#else  // LEGACY_BACKEND
+        regNumber tmpReg = regSet.rsGrabReg(grabMask);
+#endif // LEGACY_BACKEND
+        instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, frameSize);
+        if (*pUnwindStarted)
+        {
+            compiler->unwindPadding();
+        }
+
+        // We're going to generate an unwindable instruction, so check again if
+        // we need to start the unwind codes.
+
+        if (!*pUnwindStarted)
+        {
+            compiler->unwindBegEpilog();
+            *pUnwindStarted = true;
+        }
+
+        getEmitter()->emitIns_R_R(INS_add, EA_PTRSIZE, REG_SPBASE, tmpReg, INS_FLAGS_DONT_CARE);
+    }
+
+    compiler->unwindAllocStack(frameSize);
+}
+
+/*-----------------------------------------------------------------------------
+ *
+ * Returns register mask to push/pop to allocate a small stack frame,
+ * instead of using "sub sp" / "add sp". Returns RBM_NONE if either frame size
+ * is zero, or if we should use "sub sp" / "add sp" instead of push/pop.
+ */
+regMaskTP CodeGen::genStackAllocRegisterMask(unsigned frameSize, regMaskTP maskCalleeSavedFloat)
+{
+    assert(compiler->compGeneratingProlog || compiler->compGeneratingEpilog);
+
+    // We can't do this optimization with callee saved floating point registers because
+    // the stack would be allocated in a wrong spot.
+    if (maskCalleeSavedFloat != RBM_NONE)
+        return RBM_NONE;
+
+    // Allocate space for small frames by pushing extra registers. It generates smaller and faster code
+    // than the extra sub sp,XXX/add sp,XXX.
+    // R0 and R1 may be used for the return value. Keep things simple and just skip the optimization
+    // for the 3*REGSIZE_BYTES and 4*REGSIZE_BYTES cases. They are less common and they have more
+    // significant negative side-effects (more memory bus traffic).
+    switch (frameSize)
+    {
+        case REGSIZE_BYTES:
+            return RBM_R3;
+        case 2 * REGSIZE_BYTES:
+            return RBM_R2 | RBM_R3;
+        default:
+            return RBM_NONE;
+    }
+}
+
+#endif // _TARGET_ARM_
+
+#if !FEATURE_STACK_FP_X87
+
+/*****************************************************************************
+ *
+ * initFltRegs -- The mask of float regs to be zeroed.
+ * initDblRegs -- The mask of double regs to be zeroed.
+ * initReg -- A zero initialized integer reg to copy from.
+ *
+ * Makes a best effort to move between VFP/xmm regs if one is already
+ * initialized to 0 (Arm only); else copies from the integer register,
+ * which is slower.
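+ *
+ * For example (a sketch; register numbers are illustrative): on Arm, if d0
+ * has already been zeroed, a float register can be initialized with
+ * "vcvt.f32.f64 s2, d0" rather than the slower "vmov s2, rZero" copy from an
+ * integer register.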
+ */
+void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP& initDblRegs, const regNumber& initReg)
+{
+    assert(compiler->compGeneratingProlog);
+
+    // The first float/double reg that is initialized to 0. So they can be used to
+    // initialize the remaining registers.
+    regNumber fltInitReg = REG_NA;
+    regNumber dblInitReg = REG_NA;
+
+    // Iterate through float/double registers and initialize them to 0 or
+    // copy from already initialized register of the same type.
+    regMaskTP regMask = genRegMask(REG_FP_FIRST);
+    for (regNumber reg = REG_FP_FIRST; reg <= REG_FP_LAST; reg = REG_NEXT(reg), regMask <<= 1)
+    {
+        if (regMask & initFltRegs)
+        {
+            // Do we have a float register already set to 0?
+            if (fltInitReg != REG_NA)
+            {
+                // Copy from float.
+                inst_RV_RV(ins_Copy(TYP_FLOAT), reg, fltInitReg, TYP_FLOAT);
+            }
+            else
+            {
+#ifdef _TARGET_ARM_
+                // Do we have a double register initialized to 0?
+                if (dblInitReg != REG_NA)
+                {
+                    // Copy from double.
+                    inst_RV_RV(INS_vcvt_d2f, reg, dblInitReg, TYP_FLOAT);
+                }
+                else
+                {
+                    // Copy from int.
+                    inst_RV_RV(INS_vmov_i2f, reg, initReg, TYP_FLOAT, EA_4BYTE);
+                }
+#elif defined(_TARGET_XARCH_)
+                // Xorpd xmmreg, xmmreg is the fastest way to initialize a float register to zero,
+                // rather than moving the constant 0.0f. Though we only need to initialize the lower
+                // 32 bits, we use xorpd to zero all 64 bits of the xmm register so that it can also
+                // be used to zero-initialize xmm registers that hold double values.
+                inst_RV_RV(INS_xorpd, reg, reg, TYP_DOUBLE);
+                dblInitReg = reg;
+#elif defined(_TARGET_ARM64_)
+                NYI("Initialize floating-point register to zero");
+#else // _TARGET_*
+#error Unsupported or unset target architecture
+#endif
+                fltInitReg = reg;
+            }
+        }
+        else if (regMask & initDblRegs)
+        {
+            // Do we have a double register already set to 0?
+            if (dblInitReg != REG_NA)
+            {
+                // Copy from double.
+                inst_RV_RV(ins_Copy(TYP_DOUBLE), reg, dblInitReg, TYP_DOUBLE);
+            }
+            else
+            {
+#ifdef _TARGET_ARM_
+                // Do we have a float register initialized to 0?
+                if (fltInitReg != REG_NA)
+                {
+                    // Copy from float.
+                    inst_RV_RV(INS_vcvt_f2d, reg, fltInitReg, TYP_DOUBLE);
+                }
+                else
+                {
+                    // Copy from int.
+                    inst_RV_RV_RV(INS_vmov_i2d, reg, initReg, initReg, EA_8BYTE);
+                }
+#elif defined(_TARGET_XARCH_)
+                // Xorpd xmmreg, xmmreg is the fastest way to initialize a double register to zero,
+                // rather than moving the constant 0.0. The lower 32 bits of 'reg' can subsequently
+                // be used to zero-initialize xmm registers that hold float values.
+                inst_RV_RV(INS_xorpd, reg, reg, TYP_DOUBLE);
+                fltInitReg = reg;
+#elif defined(_TARGET_ARM64_)
+                // We will just zero out the entire vector register.
This sets it to a double zero value + getEmitter()->emitIns_R_I(INS_movi, EA_16BYTE, reg, 0x00, INS_OPTS_16B); +#else // _TARGET_* +#error Unsupported or unset target architecture +#endif + dblInitReg = reg; + } + } + } +} +#endif // !FEATURE_STACK_FP_X87 + +/*----------------------------------------------------------------------------- + * + * Restore any callee-saved registers we have used + */ + +#if defined(_TARGET_ARM_) + +bool CodeGen::genCanUsePopToReturn(regMaskTP maskPopRegsInt, bool jmpEpilog) +{ + assert(compiler->compGeneratingEpilog); + + if (!jmpEpilog && regSet.rsMaskPreSpillRegs(true) == RBM_NONE) + return true; + else + return false; +} + +void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog) +{ + assert(compiler->compGeneratingEpilog); + + regMaskTP maskPopRegs = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED; + regMaskTP maskPopRegsFloat = maskPopRegs & RBM_ALLFLOAT; + regMaskTP maskPopRegsInt = maskPopRegs & ~maskPopRegsFloat; + + // First, pop float registers + + if (maskPopRegsFloat != RBM_NONE) + { + genPopFltRegs(maskPopRegsFloat); + compiler->unwindPopMaskFloat(maskPopRegsFloat); + } + + // Next, pop integer registers + + if (!jmpEpilog) + { + regMaskTP maskStackAlloc = genStackAllocRegisterMask(compiler->compLclFrameSize, maskPopRegsFloat); + maskPopRegsInt |= maskStackAlloc; + } + + if (isFramePointerUsed()) + { + assert(!regSet.rsRegsModified(RBM_FPBASE)); + maskPopRegsInt |= RBM_FPBASE; + } + + if (genCanUsePopToReturn(maskPopRegsInt, jmpEpilog)) + { + maskPopRegsInt |= RBM_PC; + // Record the fact that we use a pop to the PC to perform the return + genUsedPopToReturn = true; + } + else + { + maskPopRegsInt |= RBM_LR; + // Record the fact that we did not use a pop to the PC to perform the return + genUsedPopToReturn = false; + } + + assert(FitsIn<int>(maskPopRegsInt)); + inst_IV(INS_pop, (int)maskPopRegsInt); + compiler->unwindPopMaskInt(maskPopRegsInt); +} + +#elif defined(_TARGET_ARM64_) + +void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) +{ + assert(compiler->compGeneratingEpilog); + + regMaskTP rsRestoreRegs = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED; + + if (isFramePointerUsed()) + { + rsRestoreRegs |= RBM_FPBASE; + } + + rsRestoreRegs |= RBM_LR; // We must save/restore the return address (in the LR register) + + regMaskTP regsToRestoreMask = rsRestoreRegs; + + int totalFrameSize = genTotalFrameSize(); + + int calleeSaveSPOffset; // This will be the starting place for restoring the callee-saved registers, in decreasing + // order. + int frameType = 0; // An indicator of what type of frame we are popping. + int calleeSaveSPDelta = 0; + int calleeSaveSPDeltaUnaligned = 0; + + if (isFramePointerUsed()) + { + if ((compiler->lvaOutgoingArgSpaceSize == 0) && (totalFrameSize < 512)) + { + frameType = 1; + if (compiler->compLocallocUsed) + { + // Restore sp from fp + // mov sp, fp + inst_RV_RV(INS_mov, REG_SPBASE, REG_FPBASE); + compiler->unwindSetFrameReg(REG_FPBASE, 0); + } + + regsToRestoreMask &= ~(RBM_FP | RBM_LR); // We'll restore FP/LR at the end, and post-index SP. + + // Compute callee save SP offset which is at the top of local frame while the FP/LR is saved at the bottom + // of stack. 
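+            // (A sketch of the frameType=1 layout, from low to high addresses, assuming
+            // no outgoing arg space: [saved FP,LR pair][local frame][callee-saved regs].)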
+ calleeSaveSPOffset = compiler->compLclFrameSize + 2 * REGSIZE_BYTES; + } + else if (totalFrameSize <= 512) + { + frameType = 2; + if (compiler->compLocallocUsed) + { + // Restore sp from fp + // sub sp, fp, #outsz + getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, + compiler->lvaOutgoingArgSpaceSize); + compiler->unwindSetFrameReg(REG_FPBASE, compiler->lvaOutgoingArgSpaceSize); + } + + regsToRestoreMask &= ~(RBM_FP | RBM_LR); // We'll restore FP/LR at the end, and post-index SP. + + // Compute callee save SP offset which is at the top of local frame while the FP/LR is saved at the bottom + // of stack. + calleeSaveSPOffset = compiler->compLclFrameSize + 2 * REGSIZE_BYTES; + } + else + { + frameType = 3; + + calleeSaveSPDeltaUnaligned = totalFrameSize - compiler->compLclFrameSize - + 2 * REGSIZE_BYTES; // 2 for FP, LR which we'll restore later. + assert(calleeSaveSPDeltaUnaligned >= 0); + assert((calleeSaveSPDeltaUnaligned % 8) == 0); // It better at least be 8 byte aligned. + calleeSaveSPDelta = AlignUp((UINT)calleeSaveSPDeltaUnaligned, STACK_ALIGN); + + regsToRestoreMask &= ~(RBM_FP | RBM_LR); // We'll restore FP/LR at the end, and (hopefully) post-index SP. + + int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; + assert(remainingFrameSz > 0); + + if (compiler->lvaOutgoingArgSpaceSize >= 504) + { + // We can't do "ldp fp,lr,[sp,#outsz]" because #outsz is too big. + // If compiler->lvaOutgoingArgSpaceSize is not aligned, we need to align the SP adjustment. + assert(remainingFrameSz > (int)compiler->lvaOutgoingArgSpaceSize); + int spAdjustment2Unaligned = remainingFrameSz - compiler->lvaOutgoingArgSpaceSize; + int spAdjustment2 = (int)roundUp((size_t)spAdjustment2Unaligned, STACK_ALIGN); + int alignmentAdjustment2 = spAdjustment2 - spAdjustment2Unaligned; + assert((alignmentAdjustment2 == 0) || (alignmentAdjustment2 == REGSIZE_BYTES)); + + if (compiler->compLocallocUsed) + { + // Restore sp from fp. No need to update sp after this since we've set up fp before adjusting sp in + // prolog. + // sub sp, fp, #alignmentAdjustment2 + getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, alignmentAdjustment2); + compiler->unwindSetFrameReg(REG_FPBASE, alignmentAdjustment2); + } + else + { + // Generate: + // add sp,sp,#outsz ; if #outsz is not 16-byte aligned, we need to be more + // ; careful + int spAdjustment3 = compiler->lvaOutgoingArgSpaceSize - alignmentAdjustment2; + assert(spAdjustment3 > 0); + assert((spAdjustment3 % 16) == 0); + genStackPointerAdjustment(spAdjustment3, REG_IP0, nullptr); + } + + // Generate: + // ldp fp,lr,[sp] + // add sp,sp,#remainingFrameSz + genEpilogRestoreRegPair(REG_FP, REG_LR, alignmentAdjustment2, spAdjustment2, REG_IP0, nullptr); + } + else + { + if (compiler->compLocallocUsed) + { + // Restore sp from fp + // sub sp, fp, #outsz + getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, + compiler->lvaOutgoingArgSpaceSize); + compiler->unwindSetFrameReg(REG_FPBASE, compiler->lvaOutgoingArgSpaceSize); + } + + // Generate: + // ldp fp,lr,[sp,#outsz] + // add sp,sp,#remainingFrameSz ; might need to load this constant in a scratch register if + // ; it's large + + genEpilogRestoreRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize, remainingFrameSz, REG_IP0, + nullptr); + } + + // Unlike frameType=1 or frameType=2 that restore SP at the end, + // frameType=3 already adjusted SP above to delete local frame. 
+        // There is at most one alignment slot between SP and where we store the callee-saved registers.
+        calleeSaveSPOffset = calleeSaveSPDelta - calleeSaveSPDeltaUnaligned;
+        assert((calleeSaveSPOffset == 0) || (calleeSaveSPOffset == REGSIZE_BYTES));
+        }
+    }
+    else
+    {
+        // No frame pointer (no chaining).
+        NYI("Frame without frame pointer");
+        calleeSaveSPOffset = 0;
+    }
+
+    genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, calleeSaveSPOffset, calleeSaveSPDelta);
+
+    if (frameType == 1)
+    {
+        // Generate:
+        //      ldp fp,lr,[sp],#framesz
+
+        getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, totalFrameSize,
+                                      INS_OPTS_POST_INDEX);
+        compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, -totalFrameSize);
+    }
+    else if (frameType == 2)
+    {
+        // Generate:
+        //      ldp fp,lr,[sp,#outsz]
+        //      add sp,sp,#framesz
+
+        getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE,
+                                      compiler->lvaOutgoingArgSpaceSize);
+        compiler->unwindSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize);
+
+        getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, totalFrameSize);
+        compiler->unwindAllocStack(totalFrameSize);
+    }
+    else if (frameType == 3)
+    {
+        // Nothing to do after restoring callee-saved registers.
+    }
+    else
+    {
+        unreached();
+    }
+}
+
+#elif defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87
+
+void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog)
+{
+    assert(compiler->compGeneratingEpilog);
+
+    unsigned popCount = 0;
+    if (regSet.rsRegsModified(RBM_EBX))
+    {
+        popCount++;
+        inst_RV(INS_pop, REG_EBX, TYP_I_IMPL);
+    }
+    if (regSet.rsRegsModified(RBM_FPBASE))
+    {
+        // EBP cannot be directly modified for EBP frame and double-aligned frames
+        assert(!doubleAlignOrFramePointerUsed());
+
+        popCount++;
+        inst_RV(INS_pop, REG_EBP, TYP_I_IMPL);
+    }
+
+#ifndef UNIX_AMD64_ABI
+    // In the System V AMD64 calling convention, ESI and EDI are volatile registers,
+    // so they are not saved/restored there.
+    if (regSet.rsRegsModified(RBM_ESI))
+    {
+        popCount++;
+        inst_RV(INS_pop, REG_ESI, TYP_I_IMPL);
+    }
+    if (regSet.rsRegsModified(RBM_EDI))
+    {
+        popCount++;
+        inst_RV(INS_pop, REG_EDI, TYP_I_IMPL);
+    }
+#endif // !defined(UNIX_AMD64_ABI)
+
+#ifdef _TARGET_AMD64_
+    if (regSet.rsRegsModified(RBM_R12))
+    {
+        popCount++;
+        inst_RV(INS_pop, REG_R12, TYP_I_IMPL);
+    }
+    if (regSet.rsRegsModified(RBM_R13))
+    {
+        popCount++;
+        inst_RV(INS_pop, REG_R13, TYP_I_IMPL);
+    }
+    if (regSet.rsRegsModified(RBM_R14))
+    {
+        popCount++;
+        inst_RV(INS_pop, REG_R14, TYP_I_IMPL);
+    }
+    if (regSet.rsRegsModified(RBM_R15))
+    {
+        popCount++;
+        inst_RV(INS_pop, REG_R15, TYP_I_IMPL);
+    }
+#endif // _TARGET_AMD64_
+
+    // Amd64/x86 doesn't support push/pop of xmm registers.
+    // These will get saved to stack separately after allocating
+    // space on stack in prolog sequence. PopCount is essentially
+    // tracking the count of integer registers pushed.
+
+    noway_assert(compiler->compCalleeRegsPushed == popCount);
+}
+
+#elif defined(_TARGET_X86_)
+
+void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog)
+{
+    assert(compiler->compGeneratingEpilog);
+
+    unsigned popCount = 0;
+
+    /*  NOTE:   The EBP-less frame code below depends on the fact that
+                all of the pops are generated right at the start and
+                each takes one byte of machine code.
+ */ + + if (regSet.rsRegsModified(RBM_FPBASE)) + { + // EBP cannot be directly modified for EBP frame and double-aligned frames + noway_assert(!doubleAlignOrFramePointerUsed()); + + inst_RV(INS_pop, REG_EBP, TYP_I_IMPL); + popCount++; + } + if (regSet.rsRegsModified(RBM_EBX)) + { + popCount++; + inst_RV(INS_pop, REG_EBX, TYP_I_IMPL); + } + if (regSet.rsRegsModified(RBM_ESI)) + { + popCount++; + inst_RV(INS_pop, REG_ESI, TYP_I_IMPL); + } + if (regSet.rsRegsModified(RBM_EDI)) + { + popCount++; + inst_RV(INS_pop, REG_EDI, TYP_I_IMPL); + } + noway_assert(compiler->compCalleeRegsPushed == popCount); +} + +#endif // _TARGET_* + +// We need a register with value zero. Zero the initReg, if necessary, and set *pInitRegZeroed if so. +// Return the register to use. On ARM64, we never touch the initReg, and always just return REG_ZR. +regNumber CodeGen::genGetZeroReg(regNumber initReg, bool* pInitRegZeroed) +{ +#ifdef _TARGET_ARM64_ + return REG_ZR; +#else // !_TARGET_ARM64_ + if (*pInitRegZeroed == false) + { + instGen_Set_Reg_To_Zero(EA_PTRSIZE, initReg); + *pInitRegZeroed = true; + } + return initReg; +#endif // !_TARGET_ARM64_ +} + +/*----------------------------------------------------------------------------- + * + * Do we have any untracked pointer locals at all, + * or do we need to initialize memory for locspace? + * + * untrLclHi - (Untracked locals High-Offset) The upper bound offset at which the zero init code will end + * initializing memory (not inclusive). + * untrLclLo - (Untracked locals Low-Offset) The lower bound at which the zero init code will start zero + * initializing memory. + * initReg - A scratch register (that gets set to zero on some platforms). + * pInitRegZeroed - Sets a flag that tells the callee whether or not the initReg register got zeroed. + */ +void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, bool* pInitRegZeroed) +{ + assert(compiler->compGeneratingProlog); + + if (genUseBlockInit) + { + assert(untrLclHi > untrLclLo); +#ifdef _TARGET_ARMARCH_ + /* + Generate the following code: + + For cnt less than 10 + + mov rZero1, 0 + mov rZero2, 0 + mov rCnt, <cnt> + stm <rZero1,rZero2>,[rAddr!] + <optional> stm <rZero1,rZero2>,[rAddr!] + <optional> stm <rZero1,rZero2>,[rAddr!] + <optional> stm <rZero1,rZero2>,[rAddr!] + <optional> str rZero1,[rAddr] + + For rCnt greater than or equal to 10 + + mov rZero1, 0 + mov rZero2, 0 + mov rCnt, <cnt/2> + sub rAddr, sp, OFFS + + loop: + stm <rZero1,rZero2>,[rAddr!] + sub rCnt,rCnt,1 + jnz loop + + <optional> str rZero1,[rAddr] // When cnt is odd + + NOTE: for ARM64, the instruction is stp, not stm. And we can use ZR instead of allocating registers. + */ + + regNumber rAddr; + regNumber rCnt = REG_NA; // Invalid + regMaskTP regMask; + + regMaskTP availMask = regSet.rsGetModifiedRegsMask() | RBM_INT_CALLEE_TRASH; // Set of available registers + availMask &= ~intRegState.rsCalleeRegArgMaskLiveIn; // Remove all of the incoming argument registers as they are + // currently live + availMask &= ~genRegMask(initReg); // Remove the pre-calculated initReg as we will zero it and maybe use it for + // a large constant. 
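+        // (A sketch of the selection below; registers are illustrative only: on
+        // ARM32, with r0/r1 live as incoming arguments and initReg == r2, the
+        // lowest-bit scans would pick r3 for rZero2 and the next-lowest remaining
+        // register in availMask for rAddr.)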
+
+#if defined(_TARGET_ARM_)
+
+        if (compiler->compLocallocUsed)
+        {
+            availMask &= ~RBM_SAVED_LOCALLOC_SP; // Remove the register reserved when we have a localloc frame
+        }
+
+        regNumber rZero1; // We're going to use initReg for rZero1
+        regNumber rZero2;
+
+        // We pick the next lowest register number for rZero2
+        noway_assert(availMask != RBM_NONE);
+        regMask = genFindLowestBit(availMask);
+        rZero2  = genRegNumFromMask(regMask);
+        availMask &= ~regMask;
+        assert((genRegMask(rZero2) & intRegState.rsCalleeRegArgMaskLiveIn) ==
+               0); // rZero2 is not a live incoming argument reg
+
+        // We pick the next lowest register number for rAddr
+        noway_assert(availMask != RBM_NONE);
+        regMask = genFindLowestBit(availMask);
+        rAddr   = genRegNumFromMask(regMask);
+        availMask &= ~regMask;
+
+#else // !defined(_TARGET_ARM_)
+
+        regNumber rZero1 = REG_ZR;
+        rAddr            = initReg;
+        *pInitRegZeroed  = false;
+
+#endif // !defined(_TARGET_ARM_)
+
+        bool     useLoop   = false;
+        unsigned uCntBytes = untrLclHi - untrLclLo;
+        assert((uCntBytes % sizeof(int)) == 0); // The smallest stack slot is always 4 bytes.
+        unsigned uCntSlots = uCntBytes / REGSIZE_BYTES; // How many register sized stack slots we're going to use.
+
+        // When uCntSlots is 9 or less, we will emit a sequence of stm/stp instructions inline.
+        // When it is 10 or greater, we will emit a loop containing a stm/stp instruction.
+        // In both of these cases the stm/stp instruction will write two zeros to memory
+        // and we will use a single str instruction at the end whenever we have an odd count.
+        if (uCntSlots >= 10)
+            useLoop = true;
+
+        if (useLoop)
+        {
+            // We pick the next lowest register number for rCnt
+            noway_assert(availMask != RBM_NONE);
+            regMask = genFindLowestBit(availMask);
+            rCnt    = genRegNumFromMask(regMask);
+            availMask &= ~regMask;
+        }
+
+        assert((genRegMask(rAddr) & intRegState.rsCalleeRegArgMaskLiveIn) ==
+               0); // rAddr is not a live incoming argument reg
+#if defined(_TARGET_ARM_)
+        if (arm_Valid_Imm_For_Add(untrLclLo, INS_FLAGS_DONT_CARE))
+#else  // !_TARGET_ARM_
+        if (emitter::emitIns_valid_imm_for_add(untrLclLo, EA_PTRSIZE))
+#endif // !_TARGET_ARM_
+        {
+            getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, rAddr, genFramePointerReg(), untrLclLo);
+        }
+        else
+        {
+            // Load immediate into the InitReg register
+            instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, (ssize_t)untrLclLo);
+            getEmitter()->emitIns_R_R_R(INS_add, EA_PTRSIZE, rAddr, genFramePointerReg(), initReg);
+            *pInitRegZeroed = false;
+        }
+
+        if (useLoop)
+        {
+            noway_assert(uCntSlots >= 2);
+            assert((genRegMask(rCnt) & intRegState.rsCalleeRegArgMaskLiveIn) ==
+                   0); // rCnt is not a live incoming argument reg
+            instGen_Set_Reg_To_Imm(EA_PTRSIZE, rCnt, (ssize_t)uCntSlots / 2);
+        }
+
+#if defined(_TARGET_ARM_)
+        rZero1 = genGetZeroReg(initReg, pInitRegZeroed);
+        instGen_Set_Reg_To_Zero(EA_PTRSIZE, rZero2);
+        ssize_t stmImm = (ssize_t)(genRegMask(rZero1) | genRegMask(rZero2));
+#endif // _TARGET_ARM_
+
+        if (!useLoop)
+        {
+            while (uCntBytes >= REGSIZE_BYTES * 2)
+            {
+#ifdef _TARGET_ARM_
+                getEmitter()->emitIns_R_I(INS_stm, EA_PTRSIZE, rAddr, stmImm);
+#else  // !_TARGET_ARM_
+                getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, rAddr, 2 * REGSIZE_BYTES,
+                                              INS_OPTS_POST_INDEX);
+#endif // !_TARGET_ARM_
+                uCntBytes -= REGSIZE_BYTES * 2;
+            }
+        }
+        else // useLoop is true
+        {
+#ifdef _TARGET_ARM_
+            getEmitter()->emitIns_R_I(INS_stm, EA_PTRSIZE, rAddr, stmImm); // zero stack slots
+            getEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, rCnt, 1, INS_FLAGS_SET);
+#else  // !_TARGET_ARM_
+            
getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, rAddr, 2 * REGSIZE_BYTES,
+                                          INS_OPTS_POST_INDEX); // zero stack slots
+            getEmitter()->emitIns_R_R_I(INS_subs, EA_PTRSIZE, rCnt, rCnt, 1);
+#endif // !_TARGET_ARM_
+            getEmitter()->emitIns_J(INS_bhi, NULL, -3);
+            uCntBytes %= REGSIZE_BYTES * 2;
+        }
+
+        if (uCntBytes >= REGSIZE_BYTES) // check and zero the last register-sized stack slot (odd number)
+        {
+#ifdef _TARGET_ARM_
+            getEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, rZero1, rAddr, 0);
+#else  // !_TARGET_ARM_
+            if ((uCntBytes - REGSIZE_BYTES) == 0)
+            {
+                getEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, REG_ZR, rAddr, 0);
+            }
+            else
+            {
+                getEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, REG_ZR, rAddr, REGSIZE_BYTES, INS_OPTS_POST_INDEX);
+            }
+#endif // !_TARGET_ARM_
+            uCntBytes -= REGSIZE_BYTES;
+        }
+#ifdef _TARGET_ARM64_
+        if (uCntBytes > 0)
+        {
+            assert(uCntBytes == sizeof(int));
+            getEmitter()->emitIns_R_R_I(INS_str, EA_4BYTE, REG_ZR, rAddr, 0);
+            uCntBytes -= sizeof(int);
+        }
+#endif // _TARGET_ARM64_
+        noway_assert(uCntBytes == 0);
+
+#elif defined(_TARGET_XARCH_)
+        /*
+            Generate the following code:
+
+                lea     edi, [ebp/esp-OFFS]
+                mov     ecx, <size>
+                xor     eax, eax
+                rep     stosd
+        */
+
+        noway_assert(regSet.rsRegsModified(RBM_EDI));
+
+#ifdef UNIX_AMD64_ABI
+        // For register arguments we may have to save RCX and RDI on Amd64 System V OSes
+        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RCX)
+        {
+            noway_assert(regSet.rsRegsModified(RBM_R12));
+            inst_RV_RV(INS_mov, REG_R12, REG_RCX);
+            regTracker.rsTrackRegTrash(REG_R12);
+        }
+
+        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RDI)
+        {
+            noway_assert(regSet.rsRegsModified(RBM_R13));
+            inst_RV_RV(INS_mov, REG_R13, REG_RDI);
+            regTracker.rsTrackRegTrash(REG_R13);
+        }
+#else  // !UNIX_AMD64_ABI
+        // For register arguments we may have to save ECX
+        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_ECX)
+        {
+            noway_assert(regSet.rsRegsModified(RBM_ESI));
+            inst_RV_RV(INS_mov, REG_ESI, REG_ECX);
+            regTracker.rsTrackRegTrash(REG_ESI);
+        }
+#endif // !UNIX_AMD64_ABI
+
+        noway_assert((intRegState.rsCalleeRegArgMaskLiveIn & RBM_EAX) == 0);
+
+        getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_EDI, genFramePointerReg(), untrLclLo);
+        regTracker.rsTrackRegTrash(REG_EDI);
+
+        inst_RV_IV(INS_mov, REG_ECX, (untrLclHi - untrLclLo) / sizeof(int), EA_4BYTE);
+        instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EAX);
+        instGen(INS_r_stosd);
+
+#ifdef UNIX_AMD64_ABI
+        // Move back the argument registers
+        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RCX)
+        {
+            inst_RV_RV(INS_mov, REG_RCX, REG_R12);
+        }
+
+        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RDI)
+        {
+            inst_RV_RV(INS_mov, REG_RDI, REG_R13);
+        }
+#else  // !UNIX_AMD64_ABI
+        // Move back the argument registers
+        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_ECX)
+        {
+            inst_RV_RV(INS_mov, REG_ECX, REG_ESI);
+        }
+#endif // !UNIX_AMD64_ABI
+
+#else  // _TARGET_*
+#error Unsupported or unset target architecture
+#endif // _TARGET_*
+    }
+    else if (genInitStkLclCnt > 0)
+    {
+        assert((genRegMask(initReg) & intRegState.rsCalleeRegArgMaskLiveIn) ==
+               0); // initReg is not a live incoming argument reg
+
+        /* Initialize any lvMustInit vars on the stack */
+
+        LclVarDsc* varDsc;
+        unsigned   varNum;
+
+        for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->lvaCount; varNum++, varDsc++)
+        {
+            if (!varDsc->lvMustInit)
+            {
+                continue;
+            }
+
+            // TODO-Review: I'm not sure that we're correctly handling the mustInit case for
+            // partially-enregistered vars in the case where we don't use a block init.
+ noway_assert(varDsc->lvIsInReg() || varDsc->lvOnFrame); + + // lvMustInit can only be set for GC types or TYP_STRUCT types + // or when compInitMem is true + // or when in debug code + + noway_assert(varTypeIsGC(varDsc->TypeGet()) || (varDsc->TypeGet() == TYP_STRUCT) || + compiler->info.compInitMem || compiler->opts.compDbgCode); + +#ifdef _TARGET_64BIT_ + if (!varDsc->lvOnFrame) + { + continue; + } +#else // !_TARGET_64BIT_ + if (varDsc->lvRegister) + { + if (varDsc->lvOnFrame) + { + /* This is a partially enregistered TYP_LONG var */ + noway_assert(varDsc->lvOtherReg == REG_STK); + noway_assert(varDsc->lvType == TYP_LONG); + + noway_assert(compiler->info.compInitMem); + + getEmitter()->emitIns_S_R(ins_Store(TYP_INT), EA_4BYTE, genGetZeroReg(initReg, pInitRegZeroed), + varNum, sizeof(int)); + } + continue; + } +#endif // !_TARGET_64BIT_ + + if ((varDsc->TypeGet() == TYP_STRUCT) && !compiler->info.compInitMem && + (varDsc->lvExactSize >= TARGET_POINTER_SIZE)) + { + // We only initialize the GC variables in the TYP_STRUCT + const unsigned slots = (unsigned)compiler->lvaLclSize(varNum) / REGSIZE_BYTES; + const BYTE* gcPtrs = compiler->lvaGetGcLayout(varNum); + + for (unsigned i = 0; i < slots; i++) + { + if (gcPtrs[i] != TYPE_GC_NONE) + { + getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, + genGetZeroReg(initReg, pInitRegZeroed), varNum, i * REGSIZE_BYTES); + } + } + } + else + { + regNumber zeroReg = genGetZeroReg(initReg, pInitRegZeroed); + + // zero out the whole thing rounded up to a single stack slot size + unsigned lclSize = (unsigned)roundUp(compiler->lvaLclSize(varNum), sizeof(int)); + unsigned i; + for (i = 0; i + REGSIZE_BYTES <= lclSize; i += REGSIZE_BYTES) + { + getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, zeroReg, varNum, i); + } + +#ifdef _TARGET_64BIT_ + assert(i == lclSize || (i + sizeof(int) == lclSize)); + if (i != lclSize) + { + getEmitter()->emitIns_S_R(ins_Store(TYP_INT), EA_4BYTE, zeroReg, varNum, i); + i += sizeof(int); + } +#endif // _TARGET_64BIT_ + assert(i == lclSize); + } + } + + if (!TRACK_GC_TEMP_LIFETIMES) + { + assert(compiler->tmpAllFree()); + for (TempDsc* tempThis = compiler->tmpListBeg(); tempThis != nullptr; + tempThis = compiler->tmpListNxt(tempThis)) + { + if (!varTypeIsGC(tempThis->tdTempType())) + { + continue; + } + + // printf("initialize untracked spillTmp [EBP-%04X]\n", stkOffs); + + inst_ST_RV(ins_Store(TYP_I_IMPL), tempThis, 0, genGetZeroReg(initReg, pInitRegZeroed), TYP_I_IMPL); + } + } + } +} + +/*----------------------------------------------------------------------------- + * + * Save the generic context argument. + * + * We need to do this within the "prolog" in case anyone tries to inspect + * the param-type-arg/this (which can be done after the prolog) using + * ICodeManager::GetParamTypeArg(). + */ + +void CodeGen::genReportGenericContextArg(regNumber initReg, bool* pInitRegZeroed) +{ + assert(compiler->compGeneratingProlog); + + bool reportArg = compiler->lvaReportParamTypeArg(); + + // We should report either generic context arg or "this" when used so. + if (!reportArg) + { +#ifndef JIT32_GCENCODER + if (!compiler->lvaKeepAliveAndReportThis()) +#endif + { + return; + } + } + + // For JIT32_GCENCODER, we won't be here if reportArg is false. + unsigned contextArg = reportArg ? 
compiler->info.compTypeCtxtArg : compiler->info.compThisArg; + + noway_assert(contextArg != BAD_VAR_NUM); + LclVarDsc* varDsc = &compiler->lvaTable[contextArg]; + + // We are still in the prolog and compiler->info.compTypeCtxtArg has not been + // moved to its final home location. So we need to use it from the + // incoming location. + + regNumber reg; + + bool isPrespilledForProfiling = false; +#if defined(_TARGET_ARM_) && defined(PROFILING_SUPPORTED) + isPrespilledForProfiling = + compiler->compIsProfilerHookNeeded() && compiler->lvaIsPreSpilled(contextArg, regSet.rsMaskPreSpillRegs(false)); +#endif + + // Load from the argument register only if it is not prespilled. + if (compiler->lvaIsRegArgument(contextArg) && !isPrespilledForProfiling) + { + reg = varDsc->lvArgReg; + } + else + { + if (isFramePointerUsed()) + { +#if defined(_TARGET_ARM_) + // lvStkOffs is always valid for incoming stack-arguments, even if the argument + // will become enregistered. + // On Arm compiler->compArgSize doesn't include r11 and lr sizes and hence we need to add 2*REGSIZE_BYTES + noway_assert((2 * REGSIZE_BYTES <= varDsc->lvStkOffs) && + (size_t(varDsc->lvStkOffs) < compiler->compArgSize + 2 * REGSIZE_BYTES)); +#else + // lvStkOffs is always valid for incoming stack-arguments, even if the argument + // will become enregistered. + noway_assert((0 < varDsc->lvStkOffs) && (size_t(varDsc->lvStkOffs) < compiler->compArgSize)); +#endif + } + + // We will just use the initReg since it is an available register + // and we are probably done using it anyway... + reg = initReg; + *pInitRegZeroed = false; + + // mov reg, [compiler->info.compTypeCtxtArg] + getEmitter()->emitIns_R_AR(ins_Load(TYP_I_IMPL), EA_PTRSIZE, reg, genFramePointerReg(), varDsc->lvStkOffs); + regTracker.rsTrackRegTrash(reg); + } + +#if CPU_LOAD_STORE_ARCH + getEmitter()->emitIns_R_R_I(ins_Store(TYP_I_IMPL), EA_PTRSIZE, reg, genFramePointerReg(), + compiler->lvaCachedGenericContextArgOffset()); +#else // CPU_LOAD_STORE_ARCH + // mov [ebp-lvaCachedGenericContextArgOffset()], reg + getEmitter()->emitIns_AR_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, reg, genFramePointerReg(), + compiler->lvaCachedGenericContextArgOffset()); +#endif // !CPU_LOAD_STORE_ARCH +} + +/*----------------------------------------------------------------------------- + * + * Set the "GS" security cookie in the prolog. + */ + +void CodeGen::genSetGSSecurityCookie(regNumber initReg, bool* pInitRegZeroed) +{ + assert(compiler->compGeneratingProlog); + + if (!compiler->getNeedsGSSecurityCookie()) + { + return; + } + + noway_assert(compiler->gsGlobalSecurityCookieAddr || compiler->gsGlobalSecurityCookieVal); + + if (compiler->gsGlobalSecurityCookieAddr == nullptr) + { +#ifdef _TARGET_AMD64_ + // eax = #GlobalSecurityCookieVal64; [frame.GSSecurityCookie] = eax + getEmitter()->emitIns_R_I(INS_mov, EA_PTRSIZE, REG_RAX, compiler->gsGlobalSecurityCookieVal); + getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_RAX, compiler->lvaGSSecurityCookie, 0); +#else + // mov dword ptr [frame.GSSecurityCookie], #GlobalSecurityCookieVal + instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, compiler->gsGlobalSecurityCookieVal, + compiler->lvaGSSecurityCookie, 0, initReg); +#endif + } + else + { + regNumber reg; +#ifdef _TARGET_XARCH_ + // Always use EAX on x86 and x64 + // On x64, if we're not moving into RAX, and the address isn't RIP relative, we can't encode it. 
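+        // (The 64-bit absolute-address load, "mov rax, qword ptr [moffs64]", only
+        // exists in the RAX form; a sketch of why EAX/RAX is forced here.)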
+ reg = REG_EAX; +#else + // We will just use the initReg since it is an available register + reg = initReg; +#endif + + *pInitRegZeroed = false; + +#if CPU_LOAD_STORE_ARCH + instGen_Set_Reg_To_Imm(EA_PTR_DSP_RELOC, reg, (ssize_t)compiler->gsGlobalSecurityCookieAddr); + getEmitter()->emitIns_R_R_I(ins_Load(TYP_I_IMPL), EA_PTRSIZE, reg, reg, 0); + regTracker.rsTrackRegTrash(reg); +#else + // mov reg, dword ptr [compiler->gsGlobalSecurityCookieAddr] + // mov dword ptr [frame.GSSecurityCookie], reg + getEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, reg, (ssize_t)compiler->gsGlobalSecurityCookieAddr); + regTracker.rsTrackRegTrash(reg); +#endif + getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, reg, compiler->lvaGSSecurityCookie, 0); + } +} + +#ifdef PROFILING_SUPPORTED + +/*----------------------------------------------------------------------------- + * + * Generate the profiling function enter callback. + */ + +void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed) +{ + assert(compiler->compGeneratingProlog); + + // Give profiler a chance to back out of hooking this method + if (!compiler->compIsProfilerHookNeeded()) + { + return; + } + +#ifndef LEGACY_BACKEND +#if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI) // No profiling for System V systems yet. + unsigned varNum; + LclVarDsc* varDsc; + + // Since the method needs to make a profiler callback, it should have out-going arg space allocated. + noway_assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM); + noway_assert(compiler->lvaOutgoingArgSpaceSize >= (4 * REGSIZE_BYTES)); + + // Home all arguments passed in arg registers (RCX, RDX, R8 and R9). + // In case of vararg methods, arg regs are already homed. + // + // Note: Here we don't need to worry about updating gc'info since enter + // callback is generated as part of prolog which is non-gc interruptible. + // Moreover GC cannot kick while executing inside profiler callback which is a + // profiler requirement so it can examine arguments which could be obj refs. + if (!compiler->info.compIsVarArgs) + { + for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->info.compArgsCount; varNum++, varDsc++) + { + noway_assert(varDsc->lvIsParam); + + if (!varDsc->lvIsRegArg) + { + continue; + } + + var_types storeType = varDsc->lvaArgType(); + regNumber argReg = varDsc->lvArgReg; + getEmitter()->emitIns_S_R(ins_Store(storeType), emitTypeSize(storeType), argReg, varNum, 0); + } + } + + // Emit profiler EnterCallback(ProfilerMethHnd, caller's SP) + // RCX = ProfilerMethHnd + if (compiler->compProfilerMethHndIndirected) + { + // Profiler hooks enabled during Ngen time. + // Profiler handle needs to be accessed through an indirection of a pointer. + getEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); + } + else + { + // No need to record relocations, if we are generating ELT hooks under the influence + // of complus_JitELtHookEnabled=1 + if (compiler->opts.compJitELTHookEnabled) + { + genSetRegToIcon(REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL); + } + else + { + instGen_Set_Reg_To_Imm(EA_8BYTE, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); + } + } + + // RDX = caller's SP + // Notes + // 1) Here we can query caller's SP offset since prolog will be generated after final frame layout. + // 2) caller's SP relative offset to FramePointer will be negative. We need to add absolute value + // of that offset to FramePointer to obtain caller's SP value. 
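+    //    (Worked example; the numbers are illustrative only: if callerSPOffset is
+    //    -0x50, the "lea" below computes REG_ARG_1 = FP - (-0x50) = FP + 0x50.)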
+ assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM); + int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed()); + getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_ARG_1, genFramePointerReg(), -callerSPOffset); + + // Can't have a call until we have enough padding for rejit + genPrologPadForReJit(); + + // This will emit either + // "call ip-relative 32-bit offset" or + // "mov rax, helper addr; call rax" + genEmitHelperCall(CORINFO_HELP_PROF_FCN_ENTER, 0, EA_UNKNOWN); + + // TODO-AMD64-CQ: Rather than reloading, see if this could be optimized by combining with prolog + // generation logic that moves args around as required by first BB entry point conditions + // computed by LSRA. Code pointers for investigating this further: genFnPrologCalleeRegArgs() + // and genEnregisterIncomingStackArgs(). + // + // Now reload arg registers from home locations. + // Vararg methods: + // - we need to reload only known (i.e. fixed) reg args. + // - if floating point type, also reload it into corresponding integer reg + for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->info.compArgsCount; varNum++, varDsc++) + { + noway_assert(varDsc->lvIsParam); + + if (!varDsc->lvIsRegArg) + { + continue; + } + + var_types loadType = varDsc->lvaArgType(); + regNumber argReg = varDsc->lvArgReg; + getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0); + +#if FEATURE_VARARG + if (compiler->info.compIsVarArgs && varTypeIsFloating(loadType)) + { + regNumber intArgReg = compiler->getCallArgIntRegister(argReg); + instruction ins = ins_CopyFloatToInt(loadType, TYP_LONG); + inst_RV_RV(ins, argReg, intArgReg, loadType); + } +#endif // FEATURE_VARARG + } + + // If initReg is one of RBM_CALLEE_TRASH, then it needs to be zero'ed before using. + if ((RBM_CALLEE_TRASH & genRegMask(initReg)) != 0) + { + *pInitRegZeroed = false; + } + +#else //!_TARGET_AMD64_ + NYI("RyuJIT: Emit Profiler Enter callback"); +#endif + +#else // LEGACY_BACKEND + + unsigned saveStackLvl2 = genStackLevel; + +#if defined(_TARGET_X86_) + // Important note: when you change enter probe layout, you must also update SKIP_ENTER_PROF_CALLBACK() + // for x86 stack unwinding + + // Push the profilerHandle + if (compiler->compProfilerMethHndIndirected) + { + getEmitter()->emitIns_AR_R(INS_push, EA_PTR_DSP_RELOC, REG_NA, REG_NA, (ssize_t)compiler->compProfilerMethHnd); + } + else + { + inst_IV(INS_push, (size_t)compiler->compProfilerMethHnd); + } +#elif defined(_TARGET_ARM_) + // On Arm arguments are prespilled on stack, which frees r0-r3. + // For generating Enter callout we would need two registers and one of them has to be r0 to pass profiler handle. + // The call target register could be any free register. 
+ regNumber argReg = regSet.rsGrabReg(RBM_PROFILER_ENTER_ARG); + noway_assert(argReg == REG_PROFILER_ENTER_ARG); + regSet.rsLockReg(RBM_PROFILER_ENTER_ARG); + + if (compiler->compProfilerMethHndIndirected) + { + getEmitter()->emitIns_R_AI(INS_ldr, EA_PTR_DSP_RELOC, argReg, (ssize_t)compiler->compProfilerMethHnd); + regTracker.rsTrackRegTrash(argReg); + } + else + { + instGen_Set_Reg_To_Imm(EA_4BYTE, argReg, (ssize_t)compiler->compProfilerMethHnd); + } +#else // _TARGET_* + NYI("Pushing the profilerHandle & caller's sp for the profiler callout and locking registers"); +#endif // _TARGET_* + + // + // Can't have a call until we have enough padding for rejit + // + genPrologPadForReJit(); + + // This will emit either + // "call ip-relative 32-bit offset" or + // "mov rax, helper addr; call rax" + genEmitHelperCall(CORINFO_HELP_PROF_FCN_ENTER, + 0, // argSize. Again, we have to lie about it + EA_UNKNOWN); // retSize + +#if defined(_TARGET_X86_) + // + // Adjust the number of stack slots used by this managed method if necessary. + // + if (compiler->fgPtrArgCntMax < 1) + { + compiler->fgPtrArgCntMax = 1; + } +#elif defined(_TARGET_ARM_) + // Unlock registers + regSet.rsUnlockReg(RBM_PROFILER_ENTER_ARG); + + if (initReg == argReg) + { + *pInitRegZeroed = false; + } +#else // _TARGET_* + NYI("Pushing the profilerHandle & caller's sp for the profiler callout and locking registers"); +#endif // _TARGET_* + + /* Restore the stack level */ + + genStackLevel = saveStackLvl2; +#endif // LEGACY_BACKEND +} + +/***************************************************************************** + * + * Generates Leave profiler hook. + * Technically, this is not part of the epilog; it is called when we are generating code for a GT_RETURN node. + */ + +void CodeGen::genProfilingLeaveCallback(unsigned helper /*= CORINFO_HELP_PROF_FCN_LEAVE*/) +{ + // Only hook if profiler says it's okay. + if (!compiler->compIsProfilerHookNeeded()) + { + return; + } + + compiler->info.compProfilerCallback = true; + + // Need to save on to the stack level, since the callee will pop the argument + unsigned saveStackLvl2 = genStackLevel; + +#ifndef LEGACY_BACKEND + +#if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI) // No profiling for System V systems yet. + // Since the method needs to make a profiler callback, it should have out-going arg space allocated. + noway_assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM); + noway_assert(compiler->lvaOutgoingArgSpaceSize >= (4 * REGSIZE_BYTES)); + + // If thisPtr needs to be kept alive and reported, it cannot be one of the callee trash + // registers that profiler callback kills. + if (compiler->lvaKeepAliveAndReportThis() && compiler->lvaTable[compiler->info.compThisArg].lvIsInReg()) + { + regMaskTP thisPtrMask = genRegMask(compiler->lvaTable[compiler->info.compThisArg].lvRegNum); + noway_assert((RBM_PROFILER_LEAVE_TRASH & thisPtrMask) == 0); + } + + // At this point return value is computed and stored in RAX or XMM0. + // On Amd64, Leave callback preserves the return register. We keep + // RAX alive by not reporting as trashed by helper call. Also note + // that GC cannot kick-in while executing inside profiler callback, + // which is a requirement of profiler as well since it needs to examine + // return value which could be an obj ref. + + // RCX = ProfilerMethHnd + if (compiler->compProfilerMethHndIndirected) + { + // Profiler hooks enabled during Ngen time. + // Profiler handle needs to be accessed through an indirection of an address. 
+        getEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd);
+    }
+    else
+    {
+        // Don't record relocations if we are generating ELT hooks under the influence
+        // of complus_JitELtHookEnabled=1
+        if (compiler->opts.compJitELTHookEnabled)
+        {
+            genSetRegToIcon(REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL);
+        }
+        else
+        {
+            instGen_Set_Reg_To_Imm(EA_8BYTE, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd);
+        }
+    }
+
+    // RDX = caller's SP
+    // TODO-AMD64-Cleanup: Once we start doing codegen after final frame layout, retain the "if" portion
+    // of the statements to execute unconditionally and clean up the rest.
+    if (compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT)
+    {
+        // Caller's SP relative offset to FramePointer will be negative. We need to add absolute
+        // value of that offset to FramePointer to obtain caller's SP value.
+        int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed());
+        getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_ARG_1, genFramePointerReg(), -callerSPOffset);
+    }
+    else
+    {
+        // If we are here, it means that this is a tentative frame layout, during which we
+        // cannot use caller's SP offset since it is an estimate. For now we require the
+        // method to have at least a single arg so that we can use it to obtain caller's
+        // SP.
+        LclVarDsc* varDsc = compiler->lvaTable;
+        NYI_IF((varDsc == nullptr) || !varDsc->lvIsParam, "Profiler ELT callback for a method without any params");
+
+        // lea rdx, [FramePointer + Arg0's offset]
+        getEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, REG_ARG_1, 0, 0);
+    }
+
+    // We can use any callee trash register (other than RAX, RCX, RDX) for call target.
+    // We use R8 here. This will emit either
+    // "call ip-relative 32-bit offset" or
+    // "mov r8, helper addr; call r8"
+    genEmitHelperCall(helper, 0, EA_UNKNOWN, REG_ARG_2);
+
+#else //!_TARGET_AMD64_
+    NYI("RyuJIT: Emit Profiler Leave callback");
+#endif // _TARGET_*
+
+#else // LEGACY_BACKEND
+
+#if defined(_TARGET_X86_)
+    //
+    // Push the profilerHandle
+    //
+
+    if (compiler->compProfilerMethHndIndirected)
+    {
+        getEmitter()->emitIns_AR_R(INS_push, EA_PTR_DSP_RELOC, REG_NA, REG_NA, (ssize_t)compiler->compProfilerMethHnd);
+    }
+    else
+    {
+        inst_IV(INS_push, (size_t)compiler->compProfilerMethHnd);
+    }
+    genSinglePush();
+
+    genEmitHelperCall(CORINFO_HELP_PROF_FCN_LEAVE,
+                      sizeof(int) * 1, // argSize
+                      EA_UNKNOWN);     // retSize
+
+    //
+    // Adjust the number of stack slots used by this managed method if necessary.
+    //
+    if (compiler->fgPtrArgCntMax < 1)
+    {
+        compiler->fgPtrArgCntMax = 1;
+    }
+#elif defined(_TARGET_ARM_)
+    //
+    // Push the profilerHandle
+    //
+
+    // We could optimize register usage based on whether the return value is int/long/void. But to keep it simple
+    // we will always lock RBM_PROFILER_RET_USED.
+    regNumber scratchReg = regSet.rsGrabReg(RBM_PROFILER_RET_SCRATCH);
+    noway_assert(scratchReg == REG_PROFILER_RET_SCRATCH);
+    regSet.rsLockReg(RBM_PROFILER_RET_USED);
+
+    // Contract between JIT and Profiler Leave callout on arm:
+    // Return size <= 4 bytes: REG_PROFILER_RET_SCRATCH will contain return value
+    // Return size > 4 and <= 8: <REG_PROFILER_RET_SCRATCH,r1> will contain return value.
+    // Floating point or double or HFA return values will be in s0-s15 in case of non-vararg methods.
+    // It is assumed that profiler Leave callback doesn't trash registers r1,REG_PROFILER_RET_SCRATCH and s0-s15.
+    //
+    // In the following cases r0 doesn't contain a return value and hence need not be preserved before emitting Leave
+    // callback.
+    bool     r0Trashed;
+    emitAttr attr = EA_UNKNOWN;
+
+    if (compiler->info.compRetType == TYP_VOID ||
+        (!compiler->info.compIsVarArgs && !compiler->opts.compUseSoftFP && (varTypeIsFloating(compiler->info.compRetType) ||
+         compiler->IsHfa(compiler->info.compMethodInfo->args.retTypeClass))))
+    {
+        r0Trashed = false;
+    }
+    else
+    {
+        // Has a return value and r0 is in use. For emitting Leave profiler callout we would need r0 for passing
+        // profiler handle. Therefore, r0 is moved to REG_PROFILER_RET_SCRATCH as per contract.
+        if (RBM_ARG_0 & gcInfo.gcRegGCrefSetCur)
+        {
+            attr = EA_GCREF;
+            gcInfo.gcMarkRegSetGCref(RBM_PROFILER_RET_SCRATCH);
+        }
+        else if (RBM_ARG_0 & gcInfo.gcRegByrefSetCur)
+        {
+            attr = EA_BYREF;
+            gcInfo.gcMarkRegSetByref(RBM_PROFILER_RET_SCRATCH);
+        }
+        else
+        {
+            attr = EA_4BYTE;
+        }
+
+        getEmitter()->emitIns_R_R(INS_mov, attr, REG_PROFILER_RET_SCRATCH, REG_ARG_0);
+        regTracker.rsTrackRegTrash(REG_PROFILER_RET_SCRATCH);
+        gcInfo.gcMarkRegSetNpt(RBM_ARG_0);
+        r0Trashed = true;
+    }
+
+    if (compiler->compProfilerMethHndIndirected)
+    {
+        getEmitter()->emitIns_R_AI(INS_ldr, EA_PTR_DSP_RELOC, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd);
+        regTracker.rsTrackRegTrash(REG_ARG_0);
+    }
+    else
+    {
+        instGen_Set_Reg_To_Imm(EA_4BYTE, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd);
+    }
+
+    genEmitHelperCall(CORINFO_HELP_PROF_FCN_LEAVE,
+                      0,           // argSize
+                      EA_UNKNOWN); // retSize
+
+    // Restore state that existed before profiler callback
+    if (r0Trashed)
+    {
+        getEmitter()->emitIns_R_R(INS_mov, attr, REG_ARG_0, REG_PROFILER_RET_SCRATCH);
+        regTracker.rsTrackRegTrash(REG_ARG_0);
+        gcInfo.gcMarkRegSetNpt(RBM_PROFILER_RET_SCRATCH);
+    }
+
+    regSet.rsUnlockReg(RBM_PROFILER_RET_USED);
+#else  // _TARGET_*
+    NYI("Pushing the profilerHandle & caller's sp for the profiler callout and locking them");
+#endif // _TARGET_*
+
+#endif // LEGACY_BACKEND
+
+    /* Restore the stack level */
+    genStackLevel = saveStackLvl2;
+}
+
+#endif // PROFILING_SUPPORTED
+
+/*****************************************************************************
+
+Esp frames :
+----------
+
+These instructions are just a reordering of the instructions used today.
+
+push ebp
+push esi
+push edi
+push ebx
+sub esp, LOCALS_SIZE / push dummyReg if LOCALS_SIZE=sizeof(void*)
+...
+add esp, LOCALS_SIZE / pop dummyReg
+pop ebx
+pop edi
+pop esi
+pop ebp
+ret
+
+Ebp frames :
+----------
+
+The epilog does "add esp, LOCALS_SIZE" instead of "mov esp, ebp".
+Everything else is similar, though in a different order.
+
+The security object will no longer be at a fixed offset. However, the
+offset can still be determined by looking up the GC-info and determining
+how many callee-saved registers are pushed.
+
+push ebp
+mov ebp, esp
+push esi
+push edi
+push ebx
+sub esp, LOCALS_SIZE / push dummyReg if LOCALS_SIZE=sizeof(void*)
+...
+add esp, LOCALS_SIZE / pop dummyReg
+pop ebx
+pop edi
+pop esi
+(mov esp, ebp if there are no callee-saved registers)
+pop ebp
+ret
+
+Double-aligned frame :
+--------------------
+
+LOCALS_SIZE_ADJUSTED needs to include an unused DWORD if an odd number
+of callee-saved registers are pushed on the stack so that the locals
+themselves are qword-aligned. The instructions are the same as today,
+just in a different order.
+
+push ebp
+mov ebp, esp
+and esp, 0xFFFFFFFC
+push esi
+push edi
+push ebx
+sub esp, LOCALS_SIZE_ADJUSTED / push dummyReg if LOCALS_SIZE=sizeof(void*)
+...
+add esp, LOCALS_SIZE_ADJUSTED / pop dummyReg
+pop ebx
+pop edi
+pop esi
+mov esp, ebp
+pop ebp
+ret
+
+localloc (with ebp) frames :
+--------------------------
+
+The instructions are the same as today, just in a different order.
+Also, today the epilog does "lea esp, [ebp-LOCALS_SIZE-calleeSavedRegsPushedSize]"
+which will change to "lea esp, [ebp-calleeSavedRegsPushedSize]".
+
+push ebp
+mov ebp, esp
+push esi
+push edi
+push ebx
+sub esp, LOCALS_SIZE / push dummyReg if LOCALS_SIZE=sizeof(void*)
+...
+lea esp, [ebp-calleeSavedRegsPushedSize]
+pop ebx
+pop edi
+pop esi
+(mov esp, ebp if there are no callee-saved registers)
+pop ebp
+ret
+
+*****************************************************************************/
+
+/*****************************************************************************
+ *
+ *  Generates appropriate NOP padding for a function prolog to support ReJIT.
+ */
+
+void CodeGen::genPrologPadForReJit()
+{
+    assert(compiler->compGeneratingProlog);
+
+#ifdef _TARGET_XARCH_
+    if (!(compiler->opts.eeFlags & CORJIT_FLG_PROF_REJIT_NOPS))
+    {
+        return;
+    }
+
+#if FEATURE_EH_FUNCLETS
+
+    // No need to generate pad (nops) for funclets.
+    // When compiling the main function (and not a funclet)
+    // the value of funCurrentFunc->funKind is equal to FUNC_ROOT.
+    if (compiler->funCurrentFunc()->funKind != FUNC_ROOT)
+    {
+        return;
+    }
+
+#endif // FEATURE_EH_FUNCLETS
+
+    unsigned size = getEmitter()->emitGetPrologOffsetEstimate();
+    if (size < 5)
+    {
+        instNop(5 - size);
+    }
+#endif
+}
+
+/*****************************************************************************
+ *
+ *  Reserve space for a function prolog.
+ */
+
+void CodeGen::genReserveProlog(BasicBlock* block)
+{
+    assert(block != nullptr);
+
+    JITDUMP("Reserving prolog IG for block BB%02u\n", block->bbNum);
+
+    /* Nothing is live on entry to the prolog */
+
+    getEmitter()->emitCreatePlaceholderIG(IGPT_PROLOG, block, VarSetOps::MakeEmpty(compiler), 0, 0, false);
+}
+
+/*****************************************************************************
+ *
+ *  Reserve space for a function epilog.
+ */
+
+void CodeGen::genReserveEpilog(BasicBlock* block)
+{
+    VARSET_TP VARSET_INIT(compiler, gcrefVarsArg, getEmitter()->emitThisGCrefVars);
+    regMaskTP gcrefRegsArg = gcInfo.gcRegGCrefSetCur;
+    regMaskTP byrefRegsArg = gcInfo.gcRegByrefSetCur;
+
+    /* The return value is special-cased: make sure it goes live for the epilog */
+
+    bool jmpEpilog = ((block->bbFlags & BBF_HAS_JMP) != 0);
+
+    if (genFullPtrRegMap && !jmpEpilog)
+    {
+        if (varTypeIsGC(compiler->info.compRetNativeType))
+        {
+            noway_assert(genTypeStSz(compiler->info.compRetNativeType) == genTypeStSz(TYP_I_IMPL));
+
+            gcInfo.gcMarkRegPtrVal(REG_INTRET, compiler->info.compRetNativeType);
+
+            switch (compiler->info.compRetNativeType)
+            {
+                case TYP_REF:
+                    gcrefRegsArg |= RBM_INTRET;
+                    break;
+                case TYP_BYREF:
+                    byrefRegsArg |= RBM_INTRET;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+
+    JITDUMP("Reserving epilog IG for block BB%02u\n", block->bbNum);
+
+    assert(block != nullptr);
+    bool last = (block->bbNext == nullptr);
+    getEmitter()->emitCreatePlaceholderIG(IGPT_EPILOG, block, gcrefVarsArg, gcrefRegsArg, byrefRegsArg, last);
+}
+
+#if FEATURE_EH_FUNCLETS
+
+/*****************************************************************************
+ *
+ *  Reserve space for a funclet prolog.
+ */
+
+void CodeGen::genReserveFuncletProlog(BasicBlock* block)
+{
+ assert(block != nullptr);
+
+ /* Currently, no registers are live on entry to the prolog, except maybe
+ the exception object. There might be some live stack vars, but they
+ cannot be accessed until after the frame pointer is re-established.
+ In order to potentially prevent emitting a death before the prolog
+ and a birth right after it, we just report it as live during the
+ prolog, and rely on the prolog being non-interruptible. Trust
+ genCodeForBBlist to correctly initialize all the sets.
+
+ We might need to relax these asserts if the VM ever starts
+ restoring any registers, then we could have live-in reg vars...
+ */
+
+ noway_assert((gcInfo.gcRegGCrefSetCur & RBM_EXCEPTION_OBJECT) == gcInfo.gcRegGCrefSetCur);
+ noway_assert(gcInfo.gcRegByrefSetCur == 0);
+
+ JITDUMP("Reserving funclet prolog IG for block BB%02u\n", block->bbNum);
+
+ getEmitter()->emitCreatePlaceholderIG(IGPT_FUNCLET_PROLOG, block, gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur,
+ gcInfo.gcRegByrefSetCur, false);
+}
+
+/*****************************************************************************
+ *
+ * Reserve space for a funclet epilog.
+ */
+
+void CodeGen::genReserveFuncletEpilog(BasicBlock* block)
+{
+ assert(block != nullptr);
+
+ JITDUMP("Reserving funclet epilog IG for block BB%02u\n", block->bbNum);
+
+ bool last = (block->bbNext == nullptr);
+ getEmitter()->emitCreatePlaceholderIG(IGPT_FUNCLET_EPILOG, block, gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur,
+ gcInfo.gcRegByrefSetCur, last);
+}
+
+#endif // FEATURE_EH_FUNCLETS
+
+/*****************************************************************************
+ * Finalize the frame size and offset assignments.
+ *
+ * No changes can be made to the modified register set after this, since that can affect how many
+ * callee-saved registers get saved.
+ */
+void CodeGen::genFinalizeFrame()
+{
+ JITDUMP("Finalizing stack frame\n");
+
+#ifndef LEGACY_BACKEND
+ // Initializations need to happen based on the var locations at the start
+ // of the first basic block, so load those up. In particular, the determination
+ // of whether or not to use block init in the prolog is dependent on the variable
+ // locations on entry to the function.
+ compiler->m_pLinearScan->recordVarLocationsAtStartOfBB(compiler->fgFirstBB);
+#endif // !LEGACY_BACKEND
+
+ genCheckUseBlockInit();
+
+ // Set various registers as "modified" for special code generation scenarios: Edit & Continue, P/Invoke calls, etc.
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#if defined(_TARGET_X86_)
+
+ if (compiler->compTailCallUsed)
+ {
+ // If we are generating a helper-based tailcall, we've set the tailcall helper "flags"
+ // argument to "1", indicating to the tailcall helper that we've saved the callee-saved
+ // registers (ebx, esi, edi). So, we need to make sure all the callee-saved registers
+ // actually get saved.
+
+ regSet.rsSetRegsModified(RBM_INT_CALLEE_SAVED);
+ }
+#endif // _TARGET_X86_
+
+#if defined(_TARGET_ARMARCH_)
+ // We need to determine whether SP will be adjusted by more than a specific amount; if so, we use a loop
+ // to touch the stack pages, which requires multiple scratch registers. See genAllocLclFrame() for details.
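+ // As a rough illustrative sketch (the real sequence is chosen by genAllocLclFrame()),
+ // the probing loop looks something like:
+ //     mov rCnt, #frameSize
+ //   loop:
+ //     sub sp, PAGE_SIZE
+ //     str rZero, [sp]           ; touch the guard page
+ //     subs rCnt, rCnt, #PAGE_SIZE
+ //     bgt loop
+ // rCnt/rZero here are placeholders, which is why VERY_LARGE_FRAME_SIZE_REG_MASK is
+ // marked as modified below.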
+ if (compiler->compLclFrameSize >= compiler->getVeryLargeFrameSize())
+ {
+ regSet.rsSetRegsModified(VERY_LARGE_FRAME_SIZE_REG_MASK);
+ }
+#endif // defined(_TARGET_ARMARCH_)
+
+#ifdef DEBUG
+ if (verbose)
+ {
+ printf("Modified regs: ");
+ dspRegMask(regSet.rsGetModifiedRegsMask());
+ printf("\n");
+ }
+#endif // DEBUG
+
+ // Set various registers as "modified" for special code generation scenarios: Edit & Continue, P/Invoke calls, etc.
+ if (compiler->opts.compDbgEnC)
+ {
+ // We always save FP.
+ noway_assert(isFramePointerUsed());
+#ifdef _TARGET_AMD64_
+ // On x64 we always save exactly RBP, RSI and RDI for EnC.
+ regMaskTP okRegs = (RBM_CALLEE_TRASH | RBM_FPBASE | RBM_RSI | RBM_RDI);
+ regSet.rsSetRegsModified(RBM_RSI | RBM_RDI);
+ noway_assert((regSet.rsGetModifiedRegsMask() & ~okRegs) == 0);
+#else // !_TARGET_AMD64_
+ // On x86 we save all callee saved regs so the saved reg area size is consistent
+ regSet.rsSetRegsModified(RBM_INT_CALLEE_SAVED & ~RBM_FPBASE);
+#endif // !_TARGET_AMD64_
+ }
+
+ /* If we have any pinvoke calls, we might potentially trash everything */
+ if (compiler->info.compCallUnmanaged)
+ {
+ noway_assert(isFramePointerUsed()); // Setup of Pinvoke frame currently requires an EBP style frame
+ regSet.rsSetRegsModified(RBM_INT_CALLEE_SAVED & ~RBM_FPBASE);
+ }
+
+ /* Count how many callee-saved registers will actually be saved (pushed) */
+
+ // EBP cannot be (directly) modified for EBP frame and double-aligned frames
+ noway_assert(!doubleAlignOrFramePointerUsed() || !regSet.rsRegsModified(RBM_FPBASE));
+
+#if ETW_EBP_FRAMED
+ // EBP cannot be (directly) modified
+ noway_assert(!regSet.rsRegsModified(RBM_FPBASE));
+#endif
+
+ regMaskTP maskCalleeRegsPushed = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED;
+
+#ifdef _TARGET_ARMARCH_
+ if (isFramePointerUsed())
+ {
+ // For a FP based frame we have to push/pop the FP register
+ //
+ maskCalleeRegsPushed |= RBM_FPBASE;
+
+ // This assert checks that we are not using REG_FP
+ // as both the frame pointer and as a codegen register
+ //
+ assert(!regSet.rsRegsModified(RBM_FPBASE));
+ }
+
+ // We always push LR. See genPushCalleeSavedRegisters.
+ //
+ maskCalleeRegsPushed |= RBM_LR;
+
+#if defined(_TARGET_ARM_)
+ // TODO-ARM64-Bug?: enable some variant of this for FP on ARM64?
+ regMaskTP maskPushRegsFloat = maskCalleeRegsPushed & RBM_ALLFLOAT;
+ regMaskTP maskPushRegsInt = maskCalleeRegsPushed & ~maskPushRegsFloat;
+
+ if ((maskPushRegsFloat != RBM_NONE) ||
+ (compiler->opts.MinOpts() && (regSet.rsMaskResvd & maskCalleeRegsPushed & RBM_OPT_RSVD)))
+ {
+ // Here we try to keep stack double-aligned before the vpush
+ if ((genCountBits(regSet.rsMaskPreSpillRegs(true) | maskPushRegsInt) % 2) != 0)
+ {
+ regNumber extraPushedReg = REG_R4;
+ while (maskPushRegsInt & genRegMask(extraPushedReg))
+ {
+ extraPushedReg = REG_NEXT(extraPushedReg);
+ }
+ if (extraPushedReg < REG_R11)
+ {
+ maskPushRegsInt |= genRegMask(extraPushedReg);
+ regSet.rsSetRegsModified(genRegMask(extraPushedReg));
+ }
+ }
+ maskCalleeRegsPushed = maskPushRegsInt | maskPushRegsFloat;
+ }
+
+ // We currently only expect to push/pop consecutive FP registers
+ // and these have to be double-sized registers as well.
+ // Here we will ensure that maskPushRegsFloat obeys these requirements.
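+ // For example (illustrative): if d8 and d10 were modified but d9 was not, d9 is added
+ // to the mask so that a single "vpush {d8-d10}"/"vpop {d8-d10}" pair can be used.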
+ // + if (maskPushRegsFloat != RBM_NONE) + { + regMaskTP contiguousMask = genRegMaskFloat(REG_F16, TYP_DOUBLE); + while (maskPushRegsFloat > contiguousMask) + { + contiguousMask <<= 2; + contiguousMask |= genRegMaskFloat(REG_F16, TYP_DOUBLE); + } + if (maskPushRegsFloat != contiguousMask) + { + regMaskTP maskExtraRegs = contiguousMask - maskPushRegsFloat; + maskPushRegsFloat |= maskExtraRegs; + regSet.rsSetRegsModified(maskExtraRegs); + maskCalleeRegsPushed |= maskExtraRegs; + } + } +#endif // _TARGET_ARM_ +#endif // _TARGET_ARMARCH_ + +#if defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87 + // Compute the count of callee saved float regs saved on stack. + // On Amd64 we push only integer regs. Callee saved float (xmm6-xmm15) + // regs are stack allocated and preserved in their stack locations. + compiler->compCalleeFPRegsSavedMask = maskCalleeRegsPushed & RBM_FLT_CALLEE_SAVED; + maskCalleeRegsPushed &= ~RBM_FLT_CALLEE_SAVED; +#endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87 + + compiler->compCalleeRegsPushed = genCountBits(maskCalleeRegsPushed); + +#ifdef DEBUG + if (verbose) + { + printf("Callee-saved registers pushed: %d ", compiler->compCalleeRegsPushed); + dspRegMask(maskCalleeRegsPushed); + printf("\n"); + } +#endif // DEBUG + + /* Assign the final offsets to things living on the stack frame */ + + compiler->lvaAssignFrameOffsets(Compiler::FINAL_FRAME_LAYOUT); + + /* We want to make sure that the prolog size calculated here is accurate + (that is instructions will not shrink because of conservative stack + frame approximations). We do this by filling in the correct size + here (where we have committed to the final numbers for the frame offsets) + This will ensure that the prolog size is always correct + */ + getEmitter()->emitMaxTmpSize = compiler->tmpSize; + +#ifdef DEBUG + if (compiler->opts.dspCode || compiler->opts.disAsm || compiler->opts.disAsm2 || verbose) + { + compiler->lvaTableDump(); + } +#endif +} + +//------------------------------------------------------------------------ +// genEstablishFramePointer: Set up the frame pointer by adding an offset to the stack pointer. +// +// Arguments: +// delta - the offset to add to the current stack pointer to establish the frame pointer +// reportUnwindData - true if establishing the frame pointer should be reported in the OS unwind data. + +void CodeGen::genEstablishFramePointer(int delta, bool reportUnwindData) +{ + assert(compiler->compGeneratingProlog); + +#if defined(_TARGET_XARCH_) + + if (delta == 0) + { + getEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_FPBASE, REG_SPBASE); + psiMoveESPtoEBP(); + } + else + { + getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_FPBASE, REG_SPBASE, delta); + // We don't update prolog scope info (there is no function to handle lea), but that is currently dead code + // anyway. + } + + if (reportUnwindData) + { + compiler->unwindSetFrameReg(REG_FPBASE, delta); + } + +#elif defined(_TARGET_ARM_) + + assert(arm_Valid_Imm_For_Add_SP(delta)); + getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_FPBASE, REG_SPBASE, delta); + + if (reportUnwindData) + { + compiler->unwindPadding(); + } + +#else + NYI("establish frame pointer"); +#endif +} + +/***************************************************************************** + * + * Generates code for a function prolog. + * + * NOTE REGARDING CHANGES THAT IMPACT THE DEBUGGER: + * + * The debugger relies on decoding ARM instructions to be able to successfully step through code. It does not + * implement decoding all ARM instructions. 
It only implements decoding the instructions which the JIT emits, and
+ * only instructions which result in control not going to the next instruction. Basically, any time execution would
+ * not continue at the next instruction (such as B, BL, BX, BLX, POP{pc}, etc.), the debugger has to be able to
+ * decode that instruction. If any of this is changed on ARM, the debugger team needs to be notified so that it
+ * can ensure stepping isn't broken. This is also a requirement for x86 and amd64.
+ *
+ * If any changes are made in the prolog, epilog, calls, returns, and branches, it is a good idea to notify the
+ * debugger team to ensure that stepping still works.
+ *
+ * ARM stepping code is here: debug\ee\arm\armwalker.cpp, vm\arm\armsinglestepper.cpp.
+ */
+
+#ifdef _PREFAST_
+#pragma warning(push)
+#pragma warning(disable : 21000) // Suppress PREFast warning about overly large function
+#endif
+void CodeGen::genFnProlog()
+{
+ ScopedSetVariable<bool> _setGeneratingProlog(&compiler->compGeneratingProlog, true);
+
+ compiler->funSetCurrentFunc(0);
+
+#ifdef DEBUG
+ if (verbose)
+ {
+ printf("*************** In genFnProlog()\n");
+ }
+#endif
+
+#ifdef DEBUG
+ genInterruptibleUsed = true;
+#endif
+
+#ifdef LEGACY_BACKEND
+ genFinalizeFrame();
+#endif // LEGACY_BACKEND
+
+ assert(compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT);
+
+ /* Ready to start on the prolog proper */
+
+ getEmitter()->emitBegProlog();
+ compiler->unwindBegProlog();
+
+#ifdef DEBUGGING_SUPPORT
+ // Do this so we can put the prolog instruction group ahead of
+ // other instruction groups
+ genIPmappingAddToFront((IL_OFFSETX)ICorDebugInfo::PROLOG);
+#endif // DEBUGGING_SUPPORT
+
+#ifdef DEBUG
+ if (compiler->opts.dspCode)
+ {
+ printf("\n__prolog:\n");
+ }
+#endif
+
+#ifdef DEBUGGING_SUPPORT
+ if (compiler->opts.compScopeInfo && (compiler->info.compVarScopesCount > 0))
+ {
+ // Create new scopes for the method-parameters for the prolog-block.
+ psiBegProlog();
+ }
+#endif
+
+#ifdef DEBUG
+
+ if (compiler->compJitHaltMethod())
+ {
+ /* put a nop first because the debugger and other tools are likely to
+ put an int3 at the beginning and we don't want to confuse them */
+
+ instGen(INS_nop);
+ instGen(INS_BREAKPOINT);
+
+#ifdef _TARGET_ARMARCH_
+ // Avoid asserts in the unwind info because these instructions aren't accounted for.
+ compiler->unwindPadding();
+#endif // _TARGET_ARMARCH_
+ }
+#endif // DEBUG
+
+#if FEATURE_EH_FUNCLETS && defined(DEBUG)
+
+ // We cannot force 0-initialization of the PSPSym
+ // as it will overwrite the real value
+ if (compiler->lvaPSPSym != BAD_VAR_NUM)
+ {
+ LclVarDsc* varDsc = &compiler->lvaTable[compiler->lvaPSPSym];
+ assert(!varDsc->lvMustInit);
+ }
+
+#endif // FEATURE_EH_FUNCLETS && DEBUG
+
+ /*-------------------------------------------------------------------------
+ *
+ * Record the stack frame ranges that will cover all of the tracked
+ * and untracked pointer variables.
+ * Also find which registers will need to be zero-initialized.
+ *
+ * 'initRegs': - Generally, enregistered variables should not need to be
+ * zero-inited. They only need to be zero-inited when they
+ * have a possibly uninitialized read on some control
+ * flow path. Apparently some of the IL_STUBs that we
+ * generate have this property.
+ */
+
+ int untrLclLo = +INT_MAX;
+ int untrLclHi = -INT_MAX;
+ // 'hasUntrLcl' is true if there are any stack locals which must be init'ed.
+ // Note that they may be tracked, but simply not allocated to a register.
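+ // For example (illustrative): two lvMustInit stack locals at frame offsets -0x28 and -0x10,
+ // each 8 bytes, yield untrLclLo = -0x28 and untrLclHi = -0x08; genZeroInitFrame() later
+ // zeroes that entire range.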
+ bool hasUntrLcl = false; + + int GCrefLo = +INT_MAX; + int GCrefHi = -INT_MAX; + bool hasGCRef = false; + + regMaskTP initRegs = RBM_NONE; // Registers which must be init'ed. + regMaskTP initFltRegs = RBM_NONE; // FP registers which must be init'ed. + regMaskTP initDblRegs = RBM_NONE; + + unsigned varNum; + LclVarDsc* varDsc; + + for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->lvaCount; varNum++, varDsc++) + { + if (varDsc->lvIsParam && !varDsc->lvIsRegArg) + { + continue; + } + + if (!varDsc->lvIsInReg() && !varDsc->lvOnFrame) + { + noway_assert(varDsc->lvRefCnt == 0); + continue; + } + + signed int loOffs = varDsc->lvStkOffs; + signed int hiOffs = varDsc->lvStkOffs + compiler->lvaLclSize(varNum); + + /* We need to know the offset range of tracked stack GC refs */ + /* We assume that the GC reference can be anywhere in the TYP_STRUCT */ + + if (compiler->lvaTypeIsGC(varNum) && varDsc->lvTrackedNonStruct() && varDsc->lvOnFrame) + { + // For fields of PROMOTION_TYPE_DEPENDENT type of promotion, they should have been + // taken care of by the parent struct. + if (!compiler->lvaIsFieldOfDependentlyPromotedStruct(varDsc)) + { + hasGCRef = true; + + if (loOffs < GCrefLo) + { + GCrefLo = loOffs; + } + if (hiOffs > GCrefHi) + { + GCrefHi = hiOffs; + } + } + } + + /* For lvMustInit vars, gather pertinent info */ + + if (!varDsc->lvMustInit) + { + continue; + } + + if (varDsc->lvIsInReg()) + { + regMaskTP regMask = genRegMask(varDsc->lvRegNum); + if (!varDsc->IsFloatRegType()) + { + initRegs |= regMask; + + if (varTypeIsMultiReg(varDsc)) + { + if (varDsc->lvOtherReg != REG_STK) + { + initRegs |= genRegMask(varDsc->lvOtherReg); + } + else + { + /* Upper DWORD is on the stack, and needs to be inited */ + + loOffs += sizeof(int); + goto INIT_STK; + } + } + } +#if !FEATURE_STACK_FP_X87 + else if (varDsc->TypeGet() == TYP_DOUBLE) + { + initDblRegs |= regMask; + } + else + { + initFltRegs |= regMask; + } +#endif // !FEATURE_STACK_FP_X87 + } + else + { + INIT_STK: + + hasUntrLcl = true; + + if (loOffs < untrLclLo) + { + untrLclLo = loOffs; + } + if (hiOffs > untrLclHi) + { + untrLclHi = hiOffs; + } + } + } + + /* Don't forget about spill temps that hold pointers */ + + if (!TRACK_GC_TEMP_LIFETIMES) + { + assert(compiler->tmpAllFree()); + for (TempDsc* tempThis = compiler->tmpListBeg(); tempThis != nullptr; tempThis = compiler->tmpListNxt(tempThis)) + { + if (!varTypeIsGC(tempThis->tdTempType())) + { + continue; + } + + signed int loOffs = tempThis->tdTempOffs(); + signed int hiOffs = loOffs + TARGET_POINTER_SIZE; + + // If there is a frame pointer used, due to frame pointer chaining it will point to the stored value of the + // previous frame pointer. Thus, stkOffs can't be zero. + CLANG_FORMAT_COMMENT_ANCHOR; + +#if !defined(_TARGET_AMD64_) + // However, on amd64 there is no requirement to chain frame pointers. 
+ + noway_assert(!isFramePointerUsed() || loOffs != 0); +#endif // !defined(_TARGET_AMD64_) + // printf(" Untracked tmp at [EBP-%04X]\n", -stkOffs); + + hasUntrLcl = true; + + if (loOffs < untrLclLo) + { + untrLclLo = loOffs; + } + if (hiOffs > untrLclHi) + { + untrLclHi = hiOffs; + } + } + } + + assert((genInitStkLclCnt > 0) == hasUntrLcl); + +#ifdef DEBUG + if (verbose) + { + if (genInitStkLclCnt > 0) + { + printf("Found %u lvMustInit stk vars, frame offsets %d through %d\n", genInitStkLclCnt, -untrLclLo, + -untrLclHi); + } + } +#endif + +#ifdef _TARGET_ARM_ + // On the ARM we will spill any incoming struct args in the first instruction in the prolog + // Ditto for all enregistered user arguments in a varargs method. + // These registers will be available to use for the initReg. We just remove + // all of these registers from the rsCalleeRegArgMaskLiveIn. + // + intRegState.rsCalleeRegArgMaskLiveIn &= ~regSet.rsMaskPreSpillRegs(false); +#endif + + /* Choose the register to use for zero initialization */ + + regNumber initReg = REG_SCRATCH; // Unless we find a better register below + bool initRegZeroed = false; + regMaskTP excludeMask = intRegState.rsCalleeRegArgMaskLiveIn; + regMaskTP tempMask; + + // We should not use the special PINVOKE registers as the initReg + // since they are trashed by the jithelper call to setup the PINVOKE frame + if (compiler->info.compCallUnmanaged) + { + excludeMask |= RBM_PINVOKE_FRAME; + + assert((!compiler->opts.ShouldUsePInvokeHelpers()) || (compiler->info.compLvFrameListRoot == BAD_VAR_NUM)); + if (!compiler->opts.ShouldUsePInvokeHelpers()) + { + noway_assert(compiler->info.compLvFrameListRoot < compiler->lvaCount); + + excludeMask |= (RBM_PINVOKE_TCB | RBM_PINVOKE_SCRATCH); + + // We also must exclude the register used by compLvFrameListRoot when it is enregistered + // + LclVarDsc* varDsc = &compiler->lvaTable[compiler->info.compLvFrameListRoot]; + if (varDsc->lvRegister) + { + excludeMask |= genRegMask(varDsc->lvRegNum); + } + } + } + +#ifdef _TARGET_ARM_ + // If we have a variable sized frame (compLocallocUsed is true) + // then using REG_SAVED_LOCALLOC_SP in the prolog is not allowed + if (compiler->compLocallocUsed) + { + excludeMask |= RBM_SAVED_LOCALLOC_SP; + } +#endif // _TARGET_ARM_ + +#if defined(_TARGET_XARCH_) + if (compiler->compLclFrameSize >= compiler->getVeryLargeFrameSize()) + { + // We currently must use REG_EAX on x86 here + // because the loop's backwards branch depends upon the size of EAX encodings + assert(initReg == REG_EAX); + } + else +#endif // _TARGET_XARCH_ + { + tempMask = initRegs & ~excludeMask & ~regSet.rsMaskResvd; + + if (tempMask != RBM_NONE) + { + // We will use one of the registers that we were planning to zero init anyway. + // We pick the lowest register number. + tempMask = genFindLowestBit(tempMask); + initReg = genRegNumFromMask(tempMask); + } + // Next we prefer to use one of the unused argument registers. + // If they aren't available we use one of the caller-saved integer registers. + else + { + tempMask = regSet.rsGetModifiedRegsMask() & RBM_ALLINT & ~excludeMask & ~regSet.rsMaskResvd; + if (tempMask != RBM_NONE) + { + // We pick the lowest register number + tempMask = genFindLowestBit(tempMask); + initReg = genRegNumFromMask(tempMask); + } + } + } + + noway_assert(!compiler->info.compCallUnmanaged || (initReg != REG_PINVOKE_FRAME)); + +#if defined(_TARGET_AMD64_) + // If we are a varargs call, in order to set up the arguments correctly this + // must be done in a 2 step process. 
As per the x64 ABI: + // a) The caller sets up the argument shadow space (just before the return + // address, 4 pointer sized slots). + // b) The callee is responsible to home the arguments on the shadow space + // provided by the caller. + // This way, the varargs iterator will be able to retrieve the + // call arguments properly since both the arg regs and the stack allocated + // args will be contiguous. + if (compiler->info.compIsVarArgs) + { + getEmitter()->spillIntArgRegsToShadowSlots(); + } + +#endif // _TARGET_AMD64_ + +#ifdef _TARGET_ARM_ + /*------------------------------------------------------------------------- + * + * Now start emitting the part of the prolog which sets up the frame + */ + + if (regSet.rsMaskPreSpillRegs(true) != RBM_NONE) + { + inst_IV(INS_push, (int)regSet.rsMaskPreSpillRegs(true)); + compiler->unwindPushMaskInt(regSet.rsMaskPreSpillRegs(true)); + } +#endif // _TARGET_ARM_ + +#ifdef _TARGET_XARCH_ + if (doubleAlignOrFramePointerUsed()) + { + inst_RV(INS_push, REG_FPBASE, TYP_REF); + compiler->unwindPush(REG_FPBASE); + psiAdjustStackLevel(REGSIZE_BYTES); + +#ifndef _TARGET_AMD64_ // On AMD64, establish the frame pointer after the "sub rsp" + genEstablishFramePointer(0, /*reportUnwindData*/ true); +#endif // !_TARGET_AMD64_ + +#if DOUBLE_ALIGN + if (compiler->genDoubleAlign()) + { + noway_assert(isFramePointerUsed() == false); + noway_assert(!regSet.rsRegsModified(RBM_FPBASE)); /* Trashing EBP is out. */ + + inst_RV_IV(INS_AND, REG_SPBASE, -8, EA_PTRSIZE); + } +#endif // DOUBLE_ALIGN + } +#endif // _TARGET_XARCH_ + +#ifdef _TARGET_ARM64_ + // Probe large frames now, if necessary, since genPushCalleeSavedRegisters() will allocate the frame. + genAllocLclFrame(compiler->compLclFrameSize, initReg, &initRegZeroed, intRegState.rsCalleeRegArgMaskLiveIn); + genPushCalleeSavedRegisters(initReg, &initRegZeroed); +#else // !_TARGET_ARM64_ + genPushCalleeSavedRegisters(); +#endif // !_TARGET_ARM64_ + +#ifdef _TARGET_ARM_ + bool needToEstablishFP = false; + int afterLclFrameSPtoFPdelta = 0; + if (doubleAlignOrFramePointerUsed()) + { + needToEstablishFP = true; + + // If the local frame is small enough, we establish the frame pointer after the OS-reported prolog. + // This makes the prolog and epilog match, giving us smaller unwind data. If the frame size is + // too big, we go ahead and do it here. + + int SPtoFPdelta = (compiler->compCalleeRegsPushed - 2) * REGSIZE_BYTES; + afterLclFrameSPtoFPdelta = SPtoFPdelta + compiler->compLclFrameSize; + if (!arm_Valid_Imm_For_Add_SP(afterLclFrameSPtoFPdelta)) + { + // Oh well, it looks too big. Go ahead and establish the frame pointer here. + genEstablishFramePointer(SPtoFPdelta, /*reportUnwindData*/ true); + needToEstablishFP = false; + } + } +#endif // _TARGET_ARM_ + + //------------------------------------------------------------------------- + // + // Subtract the local frame size from SP. 
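+ // (On ARM a sufficiently small frame may instead be allocated by pushing additional
+ // registers along with the callee-saved set; see maskStackAlloc just below.)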
+ //
+ //-------------------------------------------------------------------------
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#ifndef _TARGET_ARM64_
+ regMaskTP maskStackAlloc = RBM_NONE;
+
+#ifdef _TARGET_ARM_
+ maskStackAlloc =
+ genStackAllocRegisterMask(compiler->compLclFrameSize, regSet.rsGetModifiedRegsMask() & RBM_FLT_CALLEE_SAVED);
+#endif // _TARGET_ARM_
+
+ if (maskStackAlloc == RBM_NONE)
+ {
+ genAllocLclFrame(compiler->compLclFrameSize, initReg, &initRegZeroed, intRegState.rsCalleeRegArgMaskLiveIn);
+ }
+#endif // !_TARGET_ARM64_
+
+//-------------------------------------------------------------------------
+
+#ifdef _TARGET_ARM_
+ if (compiler->compLocallocUsed)
+ {
+ getEmitter()->emitIns_R_R(INS_mov, EA_4BYTE, REG_SAVED_LOCALLOC_SP, REG_SPBASE);
+ regTracker.rsTrackRegTrash(REG_SAVED_LOCALLOC_SP);
+ compiler->unwindSetFrameReg(REG_SAVED_LOCALLOC_SP, 0);
+ }
+#endif // _TARGET_ARM_
+
+#if defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87
+ // Preserve callee saved float regs to stack.
+ genPreserveCalleeSavedFltRegs(compiler->compLclFrameSize);
+#endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87
+
+#ifdef _TARGET_AMD64_
+ // Establish the AMD64 frame pointer after the OS-reported prolog.
+ if (doubleAlignOrFramePointerUsed())
+ {
+ bool reportUnwindData = compiler->compLocallocUsed || compiler->opts.compDbgEnC;
+ genEstablishFramePointer(compiler->codeGen->genSPtoFPdelta(), reportUnwindData);
+ }
+#endif //_TARGET_AMD64_
+
+//-------------------------------------------------------------------------
+//
+// This is the end of the OS-reported prolog for purposes of unwinding
+//
+//-------------------------------------------------------------------------
+
+#ifdef _TARGET_ARM_
+ if (needToEstablishFP)
+ {
+ genEstablishFramePointer(afterLclFrameSPtoFPdelta, /*reportUnwindData*/ false);
+ needToEstablishFP = false; // nobody uses this later, but set it anyway, just to be explicit
+ }
+#endif // _TARGET_ARM_
+
+ if (compiler->info.compPublishStubParam)
+ {
+#if CPU_LOAD_STORE_ARCH
+ getEmitter()->emitIns_R_R_I(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SECRET_STUB_PARAM, genFramePointerReg(),
+ compiler->lvaTable[compiler->lvaStubArgumentVar].lvStkOffs);
+#else
+ // mov [lvaStubArgumentVar], EAX
+ getEmitter()->emitIns_AR_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SECRET_STUB_PARAM, genFramePointerReg(),
+ compiler->lvaTable[compiler->lvaStubArgumentVar].lvStkOffs);
+#endif
+ assert(intRegState.rsCalleeRegArgMaskLiveIn & RBM_SECRET_STUB_PARAM);
+
+ // It's no longer live; clear it out so it can be used after this in the prolog
+ intRegState.rsCalleeRegArgMaskLiveIn &= ~RBM_SECRET_STUB_PARAM;
+ }
+
+#if STACK_PROBES
+ // We could probably fold this into the loop for the FrameSize >= 0x3000 probing
+ // when creating the stack frame. Don't think it's worth it, though.
+ if (genNeedPrologStackProbe)
+ {
+ //
+ // Can't have a call until we have enough padding for rejit
+ //
+ genPrologPadForReJit();
+ noway_assert(compiler->opts.compNeedStackProbes);
+ genGenerateStackProbe();
+ compiler->compStackProbePrologDone = true;
+ }
+#endif // STACK_PROBES
+
+ //
+ // Zero out the frame as needed
+ //
+
+ genZeroInitFrame(untrLclHi, untrLclLo, initReg, &initRegZeroed);
+
+#if FEATURE_EH_FUNCLETS
+
+ genSetPSPSym(initReg, &initRegZeroed);
+
+#else // !FEATURE_EH_FUNCLETS
+
+ // When compInitMem is true, genZeroInitFrame will zero out the shadow SP slots.
+ if (compiler->ehNeedsShadowSPslots() && !compiler->info.compInitMem)
+ {
+ /*
+ // size/speed option?
+ getEmitter()->emitIns_I_ARR(INS_mov, EA_PTRSIZE, 0,
+ REG_EBP, REG_NA, -compiler->lvaShadowSPfirstOffs);
+ */
+
+ // The last slot is reserved for ICodeManager::FixContext(ppEndRegion)
+ unsigned filterEndOffsetSlotOffs = compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - (sizeof(void*));
+
+ // Zero out the slot for nesting level 0
+ unsigned firstSlotOffs = filterEndOffsetSlotOffs - (sizeof(void*));
+
+ if (!initRegZeroed)
+ {
+ instGen_Set_Reg_To_Zero(EA_PTRSIZE, initReg);
+ initRegZeroed = true;
+ }
+
+ getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, initReg, compiler->lvaShadowSPslotsVar,
+ firstSlotOffs);
+ }
+
+#endif // !FEATURE_EH_FUNCLETS
+
+ genReportGenericContextArg(initReg, &initRegZeroed);
+
+#if defined(LEGACY_BACKEND) // in RyuJIT backend this has already been expanded into trees
+ if (compiler->info.compCallUnmanaged)
+ {
+ getEmitter()->emitDisableRandomNops();
+ initRegs = genPInvokeMethodProlog(initRegs);
+ getEmitter()->emitEnableRandomNops();
+ }
+#endif // defined(LEGACY_BACKEND)
+
+ // The local variable representing the security object must be on the stack frame
+ // and must be 0 initialized.
+ noway_assert((compiler->lvaSecurityObject == BAD_VAR_NUM) ||
+ (compiler->lvaTable[compiler->lvaSecurityObject].lvOnFrame &&
+ compiler->lvaTable[compiler->lvaSecurityObject].lvMustInit));
+
+ // Initialize any "hidden" slots/locals
+
+ if (compiler->compLocallocUsed)
+ {
+ noway_assert(compiler->lvaLocAllocSPvar != BAD_VAR_NUM);
+#ifdef _TARGET_ARM64_
+ getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_FPBASE, compiler->lvaLocAllocSPvar, 0);
+#else
+ getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaLocAllocSPvar, 0);
+#endif
+ }
+
+ // Set up the GS security cookie
+
+ genSetGSSecurityCookie(initReg, &initRegZeroed);
+
+#ifdef PROFILING_SUPPORTED
+
+ // Insert a function entry callback for profiling, if requested.
+ genProfilingEnterCallback(initReg, &initRegZeroed);
+
+#endif // PROFILING_SUPPORTED
+
+ if (!genInterruptible)
+ {
+ /*-------------------------------------------------------------------------
+ *
+ * The 'real' prolog ends here for non-interruptible methods.
+ * For fully-interruptible methods, we extend the prolog so that
+ * we do not need to track GC information while shuffling the
+ * arguments.
+ *
+ * Make sure there's enough padding for ReJIT.
+ *
+ */
+ genPrologPadForReJit();
+ getEmitter()->emitMarkPrologEnd();
+ }
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) && defined(FEATURE_SIMD)
+ // The unused bits of Vector3 arguments must be cleared
+ // since the native compiler doesn't initialize the upper bits to zero.
+ //
+ // TODO-Cleanup: This logic can be implemented in
+ // genFnPrologCalleeRegArgs() for argument registers and
+ // genEnregisterIncomingStackArgs() for stack arguments.
+ genClearStackVec3ArgUpperBits();
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING && FEATURE_SIMD
+
+ /*-----------------------------------------------------------------------------
+ * Take care of register arguments first
+ */
+
+ RegState* regState;
+
+#ifndef LEGACY_BACKEND
+ // Update the arg initial register locations.
+ compiler->lvaUpdateArgsWithInitialReg(); +#endif // !LEGACY_BACKEND + + FOREACH_REGISTER_FILE(regState) + { + if (regState->rsCalleeRegArgMaskLiveIn) + { + // If we need an extra register to shuffle around the incoming registers + // we will use xtraReg (initReg) and set the xtraRegClobbered flag, + // if we don't need to use the xtraReg then this flag will stay false + // + regNumber xtraReg; + bool xtraRegClobbered = false; + + if (genRegMask(initReg) & RBM_ARG_REGS) + { + xtraReg = initReg; + } + else + { + xtraReg = REG_SCRATCH; + initRegZeroed = false; + } + + genFnPrologCalleeRegArgs(xtraReg, &xtraRegClobbered, regState); + + if (xtraRegClobbered) + { + initRegZeroed = false; + } + } + } + + // Home the incoming arguments + genEnregisterIncomingStackArgs(); + + /* Initialize any must-init registers variables now */ + + if (initRegs) + { + regMaskTP regMask = 0x1; + + for (regNumber reg = REG_INT_FIRST; reg <= REG_INT_LAST; reg = REG_NEXT(reg), regMask <<= 1) + { + if (regMask & initRegs) + { + // Check if we have already zeroed this register + if ((reg == initReg) && initRegZeroed) + { + continue; + } + else + { + instGen_Set_Reg_To_Zero(EA_PTRSIZE, reg); + if (reg == initReg) + { + initRegZeroed = true; + } + } + } + } + } + +#if !FEATURE_STACK_FP_X87 + if (initFltRegs | initDblRegs) + { + // If initReg is not in initRegs then we will use REG_SCRATCH + if ((genRegMask(initReg) & initRegs) == 0) + { + initReg = REG_SCRATCH; + initRegZeroed = false; + } + +#ifdef _TARGET_ARM_ + // This is needed only for Arm since it can use a zero initialized int register + // to initialize vfp registers. + if (!initRegZeroed) + { + instGen_Set_Reg_To_Zero(EA_PTRSIZE, initReg); + initRegZeroed = true; + } +#endif // _TARGET_ARM_ + + genZeroInitFltRegs(initFltRegs, initDblRegs, initReg); + } +#endif // !FEATURE_STACK_FP_X87 + +#if FEATURE_STACK_FP_X87 + // + // Here is where we load the enregistered floating point arguments + // and locals onto the x86-FPU. + // + genCodeForPrologStackFP(); +#endif + + //----------------------------------------------------------------------------- + + // + // Increase the prolog size here only if fully interruptible. + // And again make sure it's big enough for ReJIT + // + + if (genInterruptible) + { + genPrologPadForReJit(); + getEmitter()->emitMarkPrologEnd(); + } + +#ifdef DEBUGGING_SUPPORT + if (compiler->opts.compScopeInfo && (compiler->info.compVarScopesCount > 0)) + { + psiEndProlog(); + } +#endif + + if (hasGCRef) + { + getEmitter()->emitSetFrameRangeGCRs(GCrefLo, GCrefHi); + } + else + { + noway_assert(GCrefLo == +INT_MAX); + noway_assert(GCrefHi == -INT_MAX); + } + +#ifdef DEBUG + if (compiler->opts.dspCode) + { + printf("\n"); + } +#endif + +#ifdef _TARGET_X86_ + // On non-x86 the VARARG cookie does not need any special treatment. + + // Load up the VARARG argument pointer register so it doesn't get clobbered. + // only do this if we actually access any statically declared args + // (our argument pointer register has a refcount > 0). 
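+ // An illustrative reading of the sequence below: the varargs cookie is passed as the last
+ // fixed argument; dereferencing it yields the size of the caller-pushed arguments, and adding
+ // that size to the last argument's address produces the base of the incoming stack args.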
+ unsigned argsStartVar = compiler->lvaVarargsBaseOfStkArgs; + + if (compiler->info.compIsVarArgs && compiler->lvaTable[argsStartVar].lvRefCnt > 0) + { + varDsc = &compiler->lvaTable[argsStartVar]; + + noway_assert(compiler->info.compArgsCount > 0); + + // MOV EAX, <VARARGS HANDLE> + getEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_EAX, compiler->info.compArgsCount - 1, 0); + regTracker.rsTrackRegTrash(REG_EAX); + + // MOV EAX, [EAX] + getEmitter()->emitIns_R_AR(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_EAX, REG_EAX, 0); + + // EDX might actually be holding something here. So make sure to only use EAX for this code + // sequence. + + LclVarDsc* lastArg = &compiler->lvaTable[compiler->info.compArgsCount - 1]; + noway_assert(!lastArg->lvRegister); + signed offset = lastArg->lvStkOffs; + assert(offset != BAD_STK_OFFS); + noway_assert(lastArg->lvFramePointerBased); + + // LEA EAX, &<VARARGS HANDLE> + EAX + getEmitter()->emitIns_R_ARR(INS_lea, EA_PTRSIZE, REG_EAX, genFramePointerReg(), REG_EAX, offset); + + if (varDsc->lvRegister) + { + if (varDsc->lvRegNum != REG_EAX) + { + getEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, varDsc->lvRegNum, REG_EAX); + regTracker.rsTrackRegTrash(varDsc->lvRegNum); + } + } + else + { + getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_EAX, argsStartVar, 0); + } + } + +#endif // _TARGET_X86_ + +#ifdef DEBUG + if (compiler->opts.compStackCheckOnRet) + { + noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC && + compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister && + compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame); + getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnEspCheck, 0); + } +#endif + + getEmitter()->emitEndProlog(); + compiler->unwindEndProlog(); + + noway_assert(getEmitter()->emitMaxTmpSize == compiler->tmpSize); +} +#ifdef _PREFAST_ +#pragma warning(pop) +#endif + +/***************************************************************************** + * + * Generates code for a function epilog. + * + * Please consult the "debugger team notification" comment in genFnProlog(). + */ + +#if defined(_TARGET_ARM_) + +void CodeGen::genFnEpilog(BasicBlock* block) +{ +#ifdef DEBUG + if (verbose) + printf("*************** In genFnEpilog()\n"); +#endif + + ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true); + + VarSetOps::Assign(compiler, gcInfo.gcVarPtrSetCur, getEmitter()->emitInitGCrefVars); + gcInfo.gcRegGCrefSetCur = getEmitter()->emitInitGCrefRegs; + gcInfo.gcRegByrefSetCur = getEmitter()->emitInitByrefRegs; + +#ifdef DEBUG + if (compiler->opts.dspCode) + printf("\n__epilog:\n"); + + if (verbose) + { + printf("gcVarPtrSetCur=%s ", VarSetOps::ToString(compiler, gcInfo.gcVarPtrSetCur)); + dumpConvertedVarSet(compiler, gcInfo.gcVarPtrSetCur); + printf(", gcRegGCrefSetCur="); + printRegMaskInt(gcInfo.gcRegGCrefSetCur); + getEmitter()->emitDispRegSet(gcInfo.gcRegGCrefSetCur); + printf(", gcRegByrefSetCur="); + printRegMaskInt(gcInfo.gcRegByrefSetCur); + getEmitter()->emitDispRegSet(gcInfo.gcRegByrefSetCur); + printf("\n"); + } +#endif + + bool jmpEpilog = ((block->bbFlags & BBF_HAS_JMP) != 0); + + // We delay starting the unwind codes until we have an instruction which we know + // needs an unwind code. 
In particular, for large stack frames in methods without + // localloc, the sequence might look something like this: + // movw r3, 0x38e0 + // add sp, r3 + // pop {r4,r5,r6,r10,r11,pc} + // In this case, the "movw" should not be part of the unwind codes, since it will + // be a NOP, and it is a waste to start with a NOP. Note that calling unwindBegEpilog() + // also sets the current location as the beginning offset of the epilog, so every + // instruction afterwards needs an unwind code. In the case above, if you call + // unwindBegEpilog() before the "movw", then you must generate a NOP for the "movw". + + bool unwindStarted = false; + + // Tear down the stack frame + + if (compiler->compLocallocUsed) + { + if (!unwindStarted) + { + compiler->unwindBegEpilog(); + unwindStarted = true; + } + + // mov R9 into SP + inst_RV_RV(INS_mov, REG_SP, REG_SAVED_LOCALLOC_SP); + compiler->unwindSetFrameReg(REG_SAVED_LOCALLOC_SP, 0); + } + + if (jmpEpilog || + genStackAllocRegisterMask(compiler->compLclFrameSize, regSet.rsGetModifiedRegsMask() & RBM_FLT_CALLEE_SAVED) == + RBM_NONE) + { + genFreeLclFrame(compiler->compLclFrameSize, &unwindStarted, jmpEpilog); + } + + if (!unwindStarted) + { + // If we haven't generated anything yet, we're certainly going to generate a "pop" next. + compiler->unwindBegEpilog(); + unwindStarted = true; + } + + genPopCalleeSavedRegisters(jmpEpilog); + + if (regSet.rsMaskPreSpillRegs(true) != RBM_NONE) + { + // We better not have used a pop PC to return otherwise this will be unreachable code + noway_assert(!genUsedPopToReturn); + + int preSpillRegArgSize = genCountBits(regSet.rsMaskPreSpillRegs(true)) * REGSIZE_BYTES; + inst_RV_IV(INS_add, REG_SPBASE, preSpillRegArgSize, EA_PTRSIZE); + compiler->unwindAllocStack(preSpillRegArgSize); + } + + if (jmpEpilog) + { + noway_assert(block->bbJumpKind == BBJ_RETURN); + noway_assert(block->bbTreeList); + + // We better not have used a pop PC to return otherwise this will be unreachable code + noway_assert(!genUsedPopToReturn); + + /* figure out what jump we have */ + + GenTree* jmpNode = block->lastNode(); + noway_assert(jmpNode->gtOper == GT_JMP); + + CORINFO_METHOD_HANDLE methHnd = (CORINFO_METHOD_HANDLE)jmpNode->gtVal.gtVal1; + + CORINFO_CONST_LOOKUP addrInfo; + void* addr; + regNumber indCallReg; + emitter::EmitCallType callType; + + compiler->info.compCompHnd->getFunctionEntryPoint(methHnd, &addrInfo); + switch (addrInfo.accessType) + { + case IAT_VALUE: + if (arm_Valid_Imm_For_BL((ssize_t)addrInfo.addr)) + { + // Simple direct call + callType = emitter::EC_FUNC_TOKEN; + addr = addrInfo.addr; + indCallReg = REG_NA; + break; + } + + // otherwise the target address doesn't fit in an immediate + // so we have to burn a register... + __fallthrough; + + case IAT_PVALUE: + // Load the address into a register, load indirect and call through a register + // We have to use R12 since we assume the argument registers are in use + callType = emitter::EC_INDIR_R; + indCallReg = REG_R12; + addr = NULL; + instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, indCallReg, (ssize_t)addrInfo.addr); + if (addrInfo.accessType == IAT_PVALUE) + { + getEmitter()->emitIns_R_R_I(INS_ldr, EA_PTRSIZE, indCallReg, indCallReg, 0); + regTracker.rsTrackRegTrash(indCallReg); + } + break; + + case IAT_PPVALUE: + default: + NO_WAY("Unsupported JMP indirection"); + } + + /* Simply emit a jump to the methodHnd. This is similar to a call so we can use + * the same descriptor with some minor adjustments. 
+ */ + + getEmitter()->emitIns_Call(callType, methHnd, INDEBUG_LDISASM_COMMA(nullptr) addr, + 0, // argSize + EA_UNKNOWN, // retSize + gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur, + BAD_IL_OFFSET, // IL offset + indCallReg, // ireg + REG_NA, // xreg + 0, // xmul + 0, // disp + true); // isJump + } + else + { + if (!genUsedPopToReturn) + { + // If we did not use a pop to return, then we did a "pop {..., lr}" instead of "pop {..., pc}", + // so we need a "bx lr" instruction to return from the function. + inst_RV(INS_bx, REG_LR, TYP_I_IMPL); + compiler->unwindBranch16(); + } + } + + compiler->unwindEndEpilog(); +} + +#elif defined(_TARGET_ARM64_) + +void CodeGen::genFnEpilog(BasicBlock* block) +{ +#ifdef DEBUG + if (verbose) + printf("*************** In genFnEpilog()\n"); +#endif + + ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true); + + VarSetOps::Assign(compiler, gcInfo.gcVarPtrSetCur, getEmitter()->emitInitGCrefVars); + gcInfo.gcRegGCrefSetCur = getEmitter()->emitInitGCrefRegs; + gcInfo.gcRegByrefSetCur = getEmitter()->emitInitByrefRegs; + +#ifdef DEBUG + if (compiler->opts.dspCode) + printf("\n__epilog:\n"); + + if (verbose) + { + printf("gcVarPtrSetCur=%s ", VarSetOps::ToString(compiler, gcInfo.gcVarPtrSetCur)); + dumpConvertedVarSet(compiler, gcInfo.gcVarPtrSetCur); + printf(", gcRegGCrefSetCur="); + printRegMaskInt(gcInfo.gcRegGCrefSetCur); + getEmitter()->emitDispRegSet(gcInfo.gcRegGCrefSetCur); + printf(", gcRegByrefSetCur="); + printRegMaskInt(gcInfo.gcRegByrefSetCur); + getEmitter()->emitDispRegSet(gcInfo.gcRegByrefSetCur); + printf("\n"); + } +#endif + + bool jmpEpilog = ((block->bbFlags & BBF_HAS_JMP) != 0); + + compiler->unwindBegEpilog(); + + genPopCalleeSavedRegistersAndFreeLclFrame(jmpEpilog); + + if (jmpEpilog) + { + noway_assert(block->bbJumpKind == BBJ_RETURN); + noway_assert(block->bbTreeList != nullptr); + + // figure out what jump we have + GenTree* jmpNode = block->lastNode(); +#if !FEATURE_FASTTAILCALL + noway_assert(jmpNode->gtOper == GT_JMP); +#else + // arm64 + // If jmpNode is GT_JMP then gtNext must be null. + // If jmpNode is a fast tail call, gtNext need not be null since it could have embedded stmts. + noway_assert((jmpNode->gtOper != GT_JMP) || (jmpNode->gtNext == nullptr)); + + // Could either be a "jmp method" or "fast tail call" implemented as epilog+jmp + noway_assert((jmpNode->gtOper == GT_JMP) || + ((jmpNode->gtOper == GT_CALL) && jmpNode->AsCall()->IsFastTailCall())); + + // The next block is associated with this "if" stmt + if (jmpNode->gtOper == GT_JMP) +#endif + { + // Simply emit a jump to the methodHnd. This is similar to a call so we can use + // the same descriptor with some minor adjustments. + CORINFO_METHOD_HANDLE methHnd = (CORINFO_METHOD_HANDLE)jmpNode->gtVal.gtVal1; + + CORINFO_CONST_LOOKUP addrInfo; + compiler->info.compCompHnd->getFunctionEntryPoint(methHnd, &addrInfo); + if (addrInfo.accessType != IAT_VALUE) + { + NYI_ARM64("Unsupported JMP indirection"); + } + + emitter::EmitCallType callType = emitter::EC_FUNC_TOKEN; + + // Simply emit a jump to the methodHnd. This is similar to a call so we can use + // the same descriptor with some minor adjustments. 
+ getEmitter()->emitIns_Call(callType, methHnd, INDEBUG_LDISASM_COMMA(nullptr) addrInfo.addr,
+ 0, // argSize
+ EA_UNKNOWN, // retSize
+ EA_UNKNOWN, // secondRetSize
+ gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur,
+ BAD_IL_OFFSET, REG_NA, REG_NA, 0, 0, /* iloffset, ireg, xreg, xmul, disp */
+ true); /* isJump */
+ }
+#if FEATURE_FASTTAILCALL
+ else
+ {
+ // Fast tail call.
+ // Call target = REG_IP0.
+ // https://github.com/dotnet/coreclr/issues/4827
+ // Do we need a special encoding for stack walker like rex.w prefix for x64?
+ getEmitter()->emitIns_R(INS_br, emitTypeSize(TYP_I_IMPL), REG_IP0);
+ }
+#endif // FEATURE_FASTTAILCALL
+ }
+ else
+ {
+ inst_RV(INS_ret, REG_LR, TYP_I_IMPL);
+ compiler->unwindReturn(REG_LR);
+ }
+
+ compiler->unwindEndEpilog();
+}
+
+#elif defined(_TARGET_XARCH_)
+
+void CodeGen::genFnEpilog(BasicBlock* block)
+{
+#ifdef DEBUG
+ if (verbose)
+ {
+ printf("*************** In genFnEpilog()\n");
+ }
+#endif
+
+ ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
+
+ VarSetOps::Assign(compiler, gcInfo.gcVarPtrSetCur, getEmitter()->emitInitGCrefVars);
+ gcInfo.gcRegGCrefSetCur = getEmitter()->emitInitGCrefRegs;
+ gcInfo.gcRegByrefSetCur = getEmitter()->emitInitByrefRegs;
+
+ noway_assert(!compiler->opts.MinOpts() || isFramePointerUsed()); // FPO not allowed with minOpts
+
+#ifdef DEBUG
+ genInterruptibleUsed = true;
+#endif
+
+ bool jmpEpilog = ((block->bbFlags & BBF_HAS_JMP) != 0);
+
+#ifdef DEBUG
+ if (compiler->opts.dspCode)
+ {
+ printf("\n__epilog:\n");
+ }
+
+ if (verbose)
+ {
+ printf("gcVarPtrSetCur=%s ", VarSetOps::ToString(compiler, gcInfo.gcVarPtrSetCur));
+ dumpConvertedVarSet(compiler, gcInfo.gcVarPtrSetCur);
+ printf(", gcRegGCrefSetCur=");
+ printRegMaskInt(gcInfo.gcRegGCrefSetCur);
+ getEmitter()->emitDispRegSet(gcInfo.gcRegGCrefSetCur);
+ printf(", gcRegByrefSetCur=");
+ printRegMaskInt(gcInfo.gcRegByrefSetCur);
+ getEmitter()->emitDispRegSet(gcInfo.gcRegByrefSetCur);
+ printf("\n");
+ }
+#endif
+
+#if !FEATURE_STACK_FP_X87
+ // Restore float registers that were saved to stack before SP is modified.
+ genRestoreCalleeSavedFltRegs(compiler->compLclFrameSize);
+#endif // !FEATURE_STACK_FP_X87
+
+ /* Compute the size in bytes we've pushed/popped */
+
+ if (!doubleAlignOrFramePointerUsed())
+ {
+ // We have an ESP frame
+
+ noway_assert(compiler->compLocallocUsed == false); // Only used with frame-pointer
+
+ /* Get rid of our local variables */
+
+ if (compiler->compLclFrameSize)
+ {
+#ifdef _TARGET_X86_
+ /* Add 'compiler->compLclFrameSize' to ESP */
+ /* Use pop ECX to increment ESP by 4, unless compiler->compJmpOpUsed is true */
+
+ if ((compiler->compLclFrameSize == sizeof(void*)) && !compiler->compJmpOpUsed)
+ {
+ inst_RV(INS_pop, REG_ECX, TYP_I_IMPL);
+ regTracker.rsTrackRegTrash(REG_ECX);
+ }
+ else
+#endif // _TARGET_X86_
+ {
+ /* Add 'compiler->compLclFrameSize' to ESP */
+ /* Generate "add esp, <stack-size>" */
+ inst_RV_IV(INS_add, REG_SPBASE, compiler->compLclFrameSize, EA_PTRSIZE);
+ }
+ }
+
+ genPopCalleeSavedRegisters();
+ }
+ else
+ {
+ noway_assert(doubleAlignOrFramePointerUsed());
+
+ /* Tear down the stack frame */
+
+ bool needMovEspEbp = false;
+
+#if DOUBLE_ALIGN
+ if (compiler->genDoubleAlign())
+ {
+ //
+ // add esp, compLclFrameSize
+ //
+ // We need not do anything (except the "mov esp, ebp") if
+ // compiler->compCalleeRegsPushed==0. However, this is unlikely, and it
+ // also complicates the code manager. Hence, we ignore that case.
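+ // Note (illustrative walk-through): the prolog ANDed ESP down for double alignment, so the
+ // "add esp" below only undoes the explicit "sub esp"; the "mov esp, ebp" emitted later via
+ // needMovEspEbp is what discards the alignment pad before the final "pop ebp".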
+
+ noway_assert(compiler->compLclFrameSize != 0);
+ inst_RV_IV(INS_add, REG_SPBASE, compiler->compLclFrameSize, EA_PTRSIZE);
+
+ needMovEspEbp = true;
+ }
+ else
+#endif // DOUBLE_ALIGN
+ {
+ bool needLea = false;
+
+ if (compiler->compLocallocUsed)
+ {
+ // ESP may be variable if a localloc was actually executed. Reset it.
+ // lea esp, [ebp - compiler->compCalleeRegsPushed * REGSIZE_BYTES]
+
+ needLea = true;
+ }
+ else if (!regSet.rsRegsModified(RBM_CALLEE_SAVED))
+ {
+ if (compiler->compLclFrameSize != 0)
+ {
+#ifdef _TARGET_AMD64_
+ // AMD64 can't use "mov esp, ebp", according to the ABI specification describing epilogs. So,
+ // do an LEA to "pop off" the frame allocation.
+ needLea = true;
+#else // !_TARGET_AMD64_
+ // We will just generate "mov esp, ebp" and be done with it.
+ needMovEspEbp = true;
+#endif // !_TARGET_AMD64_
+ }
+ }
+ else if (compiler->compLclFrameSize == 0)
+ {
+ // do nothing before popping the callee-saved registers
+ }
+#ifdef _TARGET_X86_
+ else if (compiler->compLclFrameSize == REGSIZE_BYTES)
+ {
+ // "pop ecx" will make ESP point to the callee-saved registers
+ inst_RV(INS_pop, REG_ECX, TYP_I_IMPL);
+ regTracker.rsTrackRegTrash(REG_ECX);
+ }
+#endif // _TARGET_X86_
+ else
+ {
+ // We need to make ESP point to the callee-saved registers
+ needLea = true;
+ }
+
+ if (needLea)
+ {
+ int offset;
+
+#ifdef _TARGET_AMD64_
+ // lea esp, [ebp + compiler->compLclFrameSize - genSPtoFPdelta]
+ //
+ // Case 1: localloc not used.
+ // genSPtoFPdelta = compiler->compCalleeRegsPushed * REGSIZE_BYTES + compiler->compLclFrameSize
+ // offset = compiler->compCalleeRegsPushed * REGSIZE_BYTES;
+ // The amount to be subtracted from RBP to point at callee saved int regs.
+ //
+ // Case 2: localloc used
+ // genSPtoFPdelta = Min(240, (int)compiler->lvaOutgoingArgSpaceSize)
+ // Offset = Amount to be added to RBP to point at callee saved int regs.
+ offset = genSPtoFPdelta() - compiler->compLclFrameSize;
+
+ // Offset should fit within a byte if localloc is not used.
+ if (!compiler->compLocallocUsed)
+ {
+ noway_assert(offset < UCHAR_MAX);
+ }
+#else
+ // lea esp, [ebp - compiler->compCalleeRegsPushed * REGSIZE_BYTES]
+ offset = compiler->compCalleeRegsPushed * REGSIZE_BYTES;
+ noway_assert(offset < UCHAR_MAX); // the offset fits in a byte
+#endif
+
+ getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -offset);
+ }
+ }
+
+ //
+ // Pop the callee-saved registers (if any)
+ //
+
+ genPopCalleeSavedRegisters();
+
+#ifdef _TARGET_AMD64_
+ assert(!needMovEspEbp); // "mov esp, ebp" is not allowed in AMD64 epilogs
+#else // !_TARGET_AMD64_
+ if (needMovEspEbp)
+ {
+ // mov esp, ebp
+ inst_RV_RV(INS_mov, REG_SPBASE, REG_FPBASE);
+ }
+#endif // !_TARGET_AMD64_
+
+ // pop ebp
+ inst_RV(INS_pop, REG_EBP, TYP_I_IMPL);
+ }
+
+ getEmitter()->emitStartExitSeq(); // Mark the start of the "return" sequence
+
+ /* Check if this is a special return block, i.e.
+ * a CEE_JMP instruction */
+
+ if (jmpEpilog)
+ {
+ noway_assert(block->bbJumpKind == BBJ_RETURN);
+ noway_assert(block->bbTreeList);
+
+ // figure out what jump we have
+ GenTree* jmpNode = block->lastNode();
+#if !FEATURE_FASTTAILCALL
+ // x86
+ noway_assert(jmpNode->gtOper == GT_JMP);
+#else
+ // amd64
+ // If jmpNode is GT_JMP then gtNext must be null.
+ // If jmpNode is a fast tail call, gtNext need not be null since it could have embedded stmts.
+ noway_assert((jmpNode->gtOper != GT_JMP) || (jmpNode->gtNext == nullptr)); + + // Could either be a "jmp method" or "fast tail call" implemented as epilog+jmp + noway_assert((jmpNode->gtOper == GT_JMP) || + ((jmpNode->gtOper == GT_CALL) && jmpNode->AsCall()->IsFastTailCall())); + + // The next block is associated with this "if" stmt + if (jmpNode->gtOper == GT_JMP) +#endif + { + // Simply emit a jump to the methodHnd. This is similar to a call so we can use + // the same descriptor with some minor adjustments. + CORINFO_METHOD_HANDLE methHnd = (CORINFO_METHOD_HANDLE)jmpNode->gtVal.gtVal1; + + CORINFO_CONST_LOOKUP addrInfo; + compiler->info.compCompHnd->getFunctionEntryPoint(methHnd, &addrInfo); + if (addrInfo.accessType != IAT_VALUE && addrInfo.accessType != IAT_PVALUE) + { + NO_WAY("Unsupported JMP indirection"); + } + + const emitter::EmitCallType callType = + (addrInfo.accessType == IAT_VALUE) ? emitter::EC_FUNC_TOKEN : emitter::EC_FUNC_TOKEN_INDIR; + + // Simply emit a jump to the methodHnd. This is similar to a call so we can use + // the same descriptor with some minor adjustments. + getEmitter()->emitIns_Call(callType, methHnd, INDEBUG_LDISASM_COMMA(nullptr) addrInfo.addr, + 0, // argSize + EA_UNKNOWN // retSize + FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(EA_UNKNOWN), // secondRetSize + gcInfo.gcVarPtrSetCur, + gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur, BAD_IL_OFFSET, REG_NA, REG_NA, + 0, 0, /* iloffset, ireg, xreg, xmul, disp */ + true); /* isJump */ + } +#if FEATURE_FASTTAILCALL + else + { +#ifdef _TARGET_AMD64_ + // Fast tail call. + // Call target = RAX. + // Stack walker requires that a register indirect tail call be rex.w prefixed. + getEmitter()->emitIns_R(INS_rex_jmp, emitTypeSize(TYP_I_IMPL), REG_RAX); +#else + assert(!"Fast tail call as epilog+jmp"); + unreached(); +#endif //_TARGET_AMD64_ + } +#endif // FEATURE_FASTTAILCALL + } + else + { + unsigned stkArgSize = 0; // Zero on all platforms except x86 + +#if defined(_TARGET_X86_) + + noway_assert(compiler->compArgSize >= intRegState.rsCalleeRegArgCount * sizeof(void*)); + stkArgSize = compiler->compArgSize - intRegState.rsCalleeRegArgCount * sizeof(void*); + + noway_assert(compiler->compArgSize < 0x10000); // "ret" only has 2 byte operand + + // varargs has caller pop + if (compiler->info.compIsVarArgs) + stkArgSize = 0; + +#endif // defined(_TARGET_X86_) + + /* Return, popping our arguments (if any) */ + instGen_Return(stkArgSize); + } +} + +#else // _TARGET_* +#error Unsupported or unset target architecture +#endif // _TARGET_* + +#if FEATURE_EH_FUNCLETS + +#ifdef _TARGET_ARM_ + +/***************************************************************************** + * + * Generates code for an EH funclet prolog. + * + * Funclets have the following incoming arguments: + * + * catch: r0 = the exception object that was caught (see GT_CATCH_ARG) + * filter: r0 = the exception object to filter (see GT_CATCH_ARG), r1 = CallerSP of the containing function + * finally/fault: none + * + * Funclets set the following registers on exit: + * + * catch: r0 = the address at which execution should resume (see BBJ_EHCATCHRET) + * filter: r0 = non-zero if the handler should handle the exception, zero otherwise (see GT_RETFILT) + * finally/fault: none + * + * The ARM funclet prolog sequence is: + * + * push {regs,lr} ; We push the callee-saved regs and 'lr'. + * ; TODO-ARM-CQ: We probably only need to save lr, plus any callee-save registers that we + * ; actually use in the funclet. 
Currently, we save the same set of callee-saved regs + * ; calculated for the entire function. + * sub sp, XXX ; Establish the rest of the frame. + * ; XXX is determined by lvaOutgoingArgSpaceSize plus space for the PSP slot, aligned + * ; up to preserve stack alignment. If we push an odd number of registers, we also + * ; generate this, to keep the stack aligned. + * + * ; Fill the PSP slot, for use by the VM (it gets reported with the GC info), or by code generation of nested + * ; filters. + * ; This is not part of the "OS prolog"; it has no associated unwind data, and is not reversed in the funclet + * ; epilog. + * + * if (this is a filter funclet) + * { + * // r1 on entry to a filter funclet is CallerSP of the containing function: + * // either the main function, or the funclet for a handler that this filter is dynamically nested within. + * // Note that a filter can be dynamically nested within a funclet even if it is not statically within + * // a funclet. Consider: + * // + * // try { + * // try { + * // throw new Exception(); + * // } catch(Exception) { + * // throw new Exception(); // The exception thrown here ... + * // } + * // } filter { // ... will be processed here, while the "catch" funclet frame is + * // // still on the stack + * // } filter-handler { + * // } + * // + * // Because of this, we need a PSP in the main function anytime a filter funclet doesn't know whether the + * // enclosing frame will be a funclet or main function. We won't know any time there is a filter protecting + * // nested EH. To simplify, we just always create a main function PSP for any function with a filter. + * + * ldr r1, [r1 - PSP_slot_CallerSP_offset] ; Load the CallerSP of the main function (stored in the PSP of + * ; the dynamically containing funclet or function) + * str r1, [sp + PSP_slot_SP_offset] ; store the PSP + * sub r11, r1, Function_CallerSP_to_FP_delta ; re-establish the frame pointer + * } + * else + * { + * // This is NOT a filter funclet. The VM re-establishes the frame pointer on entry. + * // TODO-ARM-CQ: if VM set r1 to CallerSP on entry, like for filters, we could save an instruction. + * + * add r3, r11, Function_CallerSP_to_FP_delta ; compute the CallerSP, given the frame pointer. r3 is scratch. + * str r3, [sp + PSP_slot_SP_offset] ; store the PSP + * } + * + * The epilog sequence is then: + * + * add sp, XXX ; if necessary + * pop {regs,pc} + * + * If it is worth it, we could push r0, r1, r2, r3 instead of using an additional add/sub instruction. + * Code size would be smaller, but we would be writing to / reading from the stack, which might be slow. 
+ *
+ * The funclet frame is thus:
+ *
+ * | |
+ * |-----------------------|
+ * | incoming |
+ * | arguments |
+ * +=======================+ <---- Caller's SP
+ * |Callee saved registers |
+ * |-----------------------|
+ * |Pre-spill regs space | // This is only necessary to keep the PSP slot at the same offset
+ * | | // in function and funclet
+ * |-----------------------|
+ * | PSP slot |
+ * |-----------------------|
+ * ~ possible 4 byte pad ~
+ * ~ for alignment ~
+ * |-----------------------|
+ * | Outgoing arg space |
+ * |-----------------------| <---- Ambient SP
+ * | | |
+ * ~ | Stack grows ~
+ * | | downward |
+ * V
+ */
+
+void CodeGen::genFuncletProlog(BasicBlock* block)
+{
+#ifdef DEBUG
+ if (verbose)
+ printf("*************** In genFuncletProlog()\n");
+#endif
+
+ assert(block != NULL);
+ assert(block->bbFlags & BBF_FUNCLET_BEG);
+
+ ScopedSetVariable<bool> _setGeneratingProlog(&compiler->compGeneratingProlog, true);
+
+ gcInfo.gcResetForBB();
+
+ compiler->unwindBegProlog();
+
+ regMaskTP maskPushRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT;
+ regMaskTP maskPushRegsInt = genFuncletInfo.fiSaveRegs & ~maskPushRegsFloat;
+
+ regMaskTP maskStackAlloc = genStackAllocRegisterMask(genFuncletInfo.fiSpDelta, maskPushRegsFloat);
+ maskPushRegsInt |= maskStackAlloc;
+
+ assert(FitsIn<int>(maskPushRegsInt));
+ inst_IV(INS_push, (int)maskPushRegsInt);
+ compiler->unwindPushMaskInt(maskPushRegsInt);
+
+ if (maskPushRegsFloat != RBM_NONE)
+ {
+ genPushFltRegs(maskPushRegsFloat);
+ compiler->unwindPushMaskFloat(maskPushRegsFloat);
+ }
+
+ bool isFilter = (block->bbCatchTyp == BBCT_FILTER);
+
+ regMaskTP maskArgRegsLiveIn;
+ if (isFilter)
+ {
+ maskArgRegsLiveIn = RBM_R0 | RBM_R1;
+ }
+ else if ((block->bbCatchTyp == BBCT_FINALLY) || (block->bbCatchTyp == BBCT_FAULT))
+ {
+ maskArgRegsLiveIn = RBM_NONE;
+ }
+ else
+ {
+ maskArgRegsLiveIn = RBM_R0;
+ }
+
+ regNumber initReg = REG_R3; // R3 is never live on entry to a funclet, so it can be trashed
+ bool initRegZeroed = false;
+
+ if (maskStackAlloc == RBM_NONE)
+ {
+ genAllocLclFrame(genFuncletInfo.fiSpDelta, initReg, &initRegZeroed, maskArgRegsLiveIn);
+ }
+
+ // This is the end of the OS-reported prolog for purposes of unwinding
+ compiler->unwindEndProlog();
+
+ if (isFilter)
+ {
+ // This is the first block of a filter
+
+ getEmitter()->emitIns_R_R_I(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_R1, REG_R1,
+ genFuncletInfo.fiPSP_slot_CallerSP_offset);
+ regTracker.rsTrackRegTrash(REG_R1);
+ getEmitter()->emitIns_R_R_I(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_R1, REG_SPBASE,
+ genFuncletInfo.fiPSP_slot_SP_offset);
+ getEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_FPBASE, REG_R1,
+ genFuncletInfo.fiFunctionCallerSPtoFPdelta);
+ }
+ else
+ {
+ // This is a non-filter funclet
+ getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, REG_R3, REG_FPBASE,
+ genFuncletInfo.fiFunctionCallerSPtoFPdelta);
+ regTracker.rsTrackRegTrash(REG_R3);
+ getEmitter()->emitIns_R_R_I(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_R3, REG_SPBASE,
+ genFuncletInfo.fiPSP_slot_SP_offset);
+ }
+}
+
+/*****************************************************************************
+ *
+ * Generates code for an EH funclet epilog.
+ */ + +void CodeGen::genFuncletEpilog() +{ +#ifdef DEBUG + if (verbose) + printf("*************** In genFuncletEpilog()\n"); +#endif + + ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true); + + // Just as for the main function, we delay starting the unwind codes until we have + // an instruction which we know needs an unwind code. This is to support code like + // this: + // movw r3, 0x38e0 + // add sp, r3 + // pop {r4,r5,r6,r10,r11,pc} + // where the "movw" shouldn't be part of the unwind codes. See genFnEpilog() for more details. + + bool unwindStarted = false; + + /* The saved regs info saves the LR register. We need to pop the PC register to return */ + assert(genFuncletInfo.fiSaveRegs & RBM_LR); + + regMaskTP maskPopRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT; + regMaskTP maskPopRegsInt = genFuncletInfo.fiSaveRegs & ~maskPopRegsFloat; + + regMaskTP maskStackAlloc = genStackAllocRegisterMask(genFuncletInfo.fiSpDelta, maskPopRegsFloat); + maskPopRegsInt |= maskStackAlloc; + + if (maskStackAlloc == RBM_NONE) + { + genFreeLclFrame(genFuncletInfo.fiSpDelta, &unwindStarted, false); + } + + if (!unwindStarted) + { + // We'll definitely generate an unwindable instruction next + compiler->unwindBegEpilog(); + unwindStarted = true; + } + + maskPopRegsInt &= ~RBM_LR; + maskPopRegsInt |= RBM_PC; + + if (maskPopRegsFloat != RBM_NONE) + { + genPopFltRegs(maskPopRegsFloat); + compiler->unwindPopMaskFloat(maskPopRegsFloat); + } + + assert(FitsIn<int>(maskPopRegsInt)); + inst_IV(INS_pop, (int)maskPopRegsInt); + compiler->unwindPopMaskInt(maskPopRegsInt); + + compiler->unwindEndEpilog(); +} + +/***************************************************************************** + * + * Capture the information used to generate the funclet prologs and epilogs. + * Note that all funclet prologs are identical, and all funclet epilogs are + * identical (per type: filters are identical, and non-filters are identical). + * Thus, we compute the data used for these just once. + * + * See genFuncletProlog() for more information about the prolog/epilog sequences. + */ + +void CodeGen::genCaptureFuncletPrologEpilogInfo() +{ + if (compiler->ehAnyFunclets()) + { + assert(isFramePointerUsed()); + assert(compiler->lvaDoneFrameLayout == + Compiler::FINAL_FRAME_LAYOUT); // The frame size and offsets must be finalized + + // Frame pointer doesn't point at the end, it points at the pushed r11. So, instead + // of adding the number of callee-saved regs to CallerSP, we add 1 for lr and 1 for r11 + // (plus the "pre spill regs"). Note that we assume r12 and r13 aren't saved + // (also assumed in genFnProlog()). 
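+        // For example (illustrative values only): if the method pre-spills {r0,r1,r2,r3}, then
+        // preSpillRegArgSize is 16 and fiFunctionCallerSPtoFPdelta is 16 + 2*4 = 24 -- the frame
+        // pointer (the pushed r11) sits 24 bytes below Caller-SP: 16 bytes of pre-spill, then the
+        // pushed lr, then the pushed r11.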
+ assert((regSet.rsMaskCalleeSaved & (RBM_R12 | RBM_R13)) == 0); + unsigned preSpillRegArgSize = genCountBits(regSet.rsMaskPreSpillRegs(true)) * REGSIZE_BYTES; + genFuncletInfo.fiFunctionCallerSPtoFPdelta = preSpillRegArgSize + 2 * REGSIZE_BYTES; + + regMaskTP rsMaskSaveRegs = regSet.rsMaskCalleeSaved; + unsigned saveRegsCount = genCountBits(rsMaskSaveRegs); + unsigned saveRegsSize = saveRegsCount * REGSIZE_BYTES; // bytes of regs we're saving + assert(compiler->lvaOutgoingArgSpaceSize % REGSIZE_BYTES == 0); + unsigned funcletFrameSize = + preSpillRegArgSize + saveRegsSize + REGSIZE_BYTES /* PSP slot */ + compiler->lvaOutgoingArgSpaceSize; + + unsigned funcletFrameSizeAligned = roundUp(funcletFrameSize, STACK_ALIGN); + unsigned funcletFrameAlignmentPad = funcletFrameSizeAligned - funcletFrameSize; + unsigned spDelta = funcletFrameSizeAligned - saveRegsSize; + + unsigned PSP_slot_SP_offset = compiler->lvaOutgoingArgSpaceSize + funcletFrameAlignmentPad; + int PSP_slot_CallerSP_offset = + -(int)(funcletFrameSize - compiler->lvaOutgoingArgSpaceSize); // NOTE: it's negative! + + /* Now save it for future use */ + + genFuncletInfo.fiSaveRegs = rsMaskSaveRegs; + genFuncletInfo.fiSpDelta = spDelta; + genFuncletInfo.fiPSP_slot_SP_offset = PSP_slot_SP_offset; + genFuncletInfo.fiPSP_slot_CallerSP_offset = PSP_slot_CallerSP_offset; + +#ifdef DEBUG + if (verbose) + { + printf("\n"); + printf("Funclet prolog / epilog info\n"); + printf(" Function CallerSP-to-FP delta: %d\n", genFuncletInfo.fiFunctionCallerSPtoFPdelta); + printf(" Save regs: "); + dspRegMask(rsMaskSaveRegs); + printf("\n"); + printf(" SP delta: %d\n", genFuncletInfo.fiSpDelta); + printf(" PSP slot SP offset: %d\n", genFuncletInfo.fiPSP_slot_SP_offset); + printf(" PSP slot Caller SP offset: %d\n", genFuncletInfo.fiPSP_slot_CallerSP_offset); + + if (PSP_slot_CallerSP_offset != + compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)) // for debugging + printf("lvaGetCallerSPRelativeOffset(lvaPSPSym): %d\n", + compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)); + } +#endif // DEBUG + + assert(PSP_slot_CallerSP_offset < 0); + assert(compiler->lvaPSPSym != BAD_VAR_NUM); + assert(PSP_slot_CallerSP_offset == compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)); // same offset + // used in main + // function and + // funclet! + } +} + +#elif defined(_TARGET_AMD64_) + +/***************************************************************************** + * + * Generates code for an EH funclet prolog. + * + * Funclets have the following incoming arguments: + * + * catch/filter-handler: rcx = InitialSP, rdx = the exception object that was caught (see GT_CATCH_ARG) + * filter: rcx = InitialSP, rdx = the exception object to filter (see GT_CATCH_ARG) + * finally/fault: rcx = InitialSP + * + * Funclets set the following registers on exit: + * + * catch/filter-handler: rax = the address at which execution should resume (see BBJ_EHCATCHRET) + * filter: rax = non-zero if the handler should handle the exception, zero otherwise (see GT_RETFILT) + * finally/fault: none + * + * The AMD64 funclet prolog sequence is: + * + * push ebp + * push callee-saved regs + * ; TODO-AMD64-CQ: We probably only need to save any callee-save registers that we actually use + * ; in the funclet. Currently, we save the same set of callee-saved regs calculated for + * ; the entire function. + * sub sp, XXX ; Establish the rest of the frame. + * ; XXX is determined by lvaOutgoingArgSpaceSize plus space for the PSP slot, aligned + * ; up to preserve stack alignment. 
If we push an odd number of registers, we also + * ; generate this, to keep the stack aligned. + * + * ; Fill the PSP slot, for use by the VM (it gets reported with the GC info), or by code generation of nested + * ; filters. + * ; This is not part of the "OS prolog"; it has no associated unwind data, and is not reversed in the funclet + * ; epilog. + * ; Also, re-establish the frame pointer from the PSP. + * + * mov rbp, [rcx + PSP_slot_InitialSP_offset] ; Load the PSP (InitialSP of the main function stored in the + * ; PSP of the dynamically containing funclet or function) + * mov [rsp + PSP_slot_InitialSP_offset], rbp ; store the PSP in our frame + * lea ebp, [rbp + Function_InitialSP_to_FP_delta] ; re-establish the frame pointer of the parent frame. If + * ; Function_InitialSP_to_FP_delta==0, we don't need this + * ; instruction. + * + * The epilog sequence is then: + * + * add rsp, XXX + * pop callee-saved regs ; if necessary + * pop rbp + * ret + * + * The funclet frame is thus: + * + * | | + * |-----------------------| + * | incoming | + * | arguments | + * +=======================+ <---- Caller's SP + * | Return address | + * |-----------------------| + * | Saved EBP | + * |-----------------------| + * |Callee saved registers | + * |-----------------------| + * ~ possible 8 byte pad ~ + * ~ for alignment ~ + * |-----------------------| + * | PSP slot | + * |-----------------------| + * | Outgoing arg space | // this only exists if the function makes a call + * |-----------------------| <---- Initial SP + * | | | + * ~ | Stack grows ~ + * | | downward | + * V + * + * TODO-AMD64-Bug?: the frame pointer should really point to the PSP slot (the debugger seems to assume this + * in DacDbiInterfaceImpl::InitParentFrameInfo()), or someplace above Initial-SP. There is an AMD64 + * UNWIND_INFO restriction that it must be within 240 bytes of Initial-SP. See jit64\amd64\inc\md.h + * "FRAMEPTR OFFSETS" for details. + */ + +void CodeGen::genFuncletProlog(BasicBlock* block) +{ +#ifdef DEBUG + if (verbose) + { + printf("*************** In genFuncletProlog()\n"); + } +#endif + + assert(!regSet.rsRegsModified(RBM_FPBASE)); + assert(block != nullptr); + assert(block->bbFlags & BBF_FUNCLET_BEG); + assert(isFramePointerUsed()); + + ScopedSetVariable<bool> _setGeneratingProlog(&compiler->compGeneratingProlog, true); + + gcInfo.gcResetForBB(); + + compiler->unwindBegProlog(); + + // We need to push ebp, since it's callee-saved. + // We need to push the callee-saved registers. We only need to push the ones that we need, but we don't + // keep track of that on a per-funclet basis, so we push the same set as in the main function. + // The only fixed-size frame we need to allocate is whatever is big enough for the PSPSym, since nothing else + // is stored here (all temps are allocated in the parent frame). + // We do need to allocate the outgoing argument space, in case there are calls here. This must be the same + // size as the parent frame's outgoing argument space, to keep the PSPSym offset the same. + + inst_RV(INS_push, REG_FPBASE, TYP_REF); + compiler->unwindPush(REG_FPBASE); + + // Callee saved int registers are pushed to stack. 
+    genPushCalleeSavedRegisters();
+
+    regMaskTP maskArgRegsLiveIn;
+    if ((block->bbCatchTyp == BBCT_FINALLY) || (block->bbCatchTyp == BBCT_FAULT))
+    {
+        maskArgRegsLiveIn = RBM_ARG_0;
+    }
+    else
+    {
+        maskArgRegsLiveIn = RBM_ARG_0 | RBM_ARG_2;
+    }
+
+    regNumber initReg       = REG_EBP; // We already saved EBP, so it can be trashed
+    bool      initRegZeroed = false;
+
+    genAllocLclFrame(genFuncletInfo.fiSpDelta, initReg, &initRegZeroed, maskArgRegsLiveIn);
+
+    // Callee saved float registers are copied to stack in their assigned stack slots
+    // after allocating space for them as part of funclet frame.
+    genPreserveCalleeSavedFltRegs(genFuncletInfo.fiSpDelta);
+
+    // This is the end of the OS-reported prolog for purposes of unwinding
+    compiler->unwindEndProlog();
+
+    getEmitter()->emitIns_R_AR(INS_mov, EA_PTRSIZE, REG_FPBASE, REG_ARG_0, genFuncletInfo.fiPSP_slot_InitialSP_offset);
+
+    regTracker.rsTrackRegTrash(REG_FPBASE);
+
+    getEmitter()->emitIns_AR_R(INS_mov, EA_PTRSIZE, REG_FPBASE, REG_SPBASE, genFuncletInfo.fiPSP_slot_InitialSP_offset);
+
+    if (genFuncletInfo.fiFunction_InitialSP_to_FP_delta != 0)
+    {
+        getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_FPBASE, REG_FPBASE,
+                                   genFuncletInfo.fiFunction_InitialSP_to_FP_delta);
+    }
+
+    // We've modified EBP, but not really. Say that we haven't...
+    regSet.rsRemoveRegsModified(RBM_FPBASE);
+}
+
+/*****************************************************************************
+ *
+ *  Generates code for an EH funclet epilog.
+ *
+ *  Note that we don't do anything with unwind codes, because AMD64 only cares about unwind codes for the prolog.
+ */
+
+void CodeGen::genFuncletEpilog()
+{
+#ifdef DEBUG
+    if (verbose)
+    {
+        printf("*************** In genFuncletEpilog()\n");
+    }
+#endif
+
+    ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
+
+    // Restore callee saved XMM regs from their stack slots before modifying SP
+    // to position at callee saved int regs.
+    genRestoreCalleeSavedFltRegs(genFuncletInfo.fiSpDelta);
+    inst_RV_IV(INS_add, REG_SPBASE, genFuncletInfo.fiSpDelta, EA_PTRSIZE);
+    genPopCalleeSavedRegisters();
+    inst_RV(INS_pop, REG_EBP, TYP_I_IMPL);
+    instGen_Return(0);
+}
+
+/*****************************************************************************
+ *
+ *  Capture the information used to generate the funclet prologs and epilogs.
+ */
+
+void CodeGen::genCaptureFuncletPrologEpilogInfo()
+{
+    if (!compiler->ehAnyFunclets())
+    {
+        return;
+    }
+
+    // Note that compLclFrameSize can't be used (nor can we call functions that depend on it),
+    // because we're not going to allocate the same size frame as the parent.
+
+    assert(isFramePointerUsed());
+    assert(compiler->lvaDoneFrameLayout ==
+           Compiler::FINAL_FRAME_LAYOUT); // The frame size and offsets must be finalized
+    assert(compiler->compCalleeFPRegsSavedMask != (regMaskTP)-1); // The float registers to be preserved are finalized
+
+    // Even though lvaToInitialSPRelativeOffset() depends on compLclFrameSize,
+    // that's ok, because we're figuring out an offset in the parent frame.
+    genFuncletInfo.fiFunction_InitialSP_to_FP_delta =
+        compiler->lvaToInitialSPRelativeOffset(0, true); // trick to find the Initial-SP-relative offset of the frame
+                                                         // pointer.
+
+    assert(compiler->lvaOutgoingArgSpaceSize % REGSIZE_BYTES == 0);
+#ifndef UNIX_AMD64_ABI
+    // No 4 slots for outgoing params on the stack for System V systems.
+ assert((compiler->lvaOutgoingArgSpaceSize == 0) || + (compiler->lvaOutgoingArgSpaceSize >= (4 * REGSIZE_BYTES))); // On AMD64, we always have 4 outgoing argument +// slots if there are any calls in the function. +#endif // UNIX_AMD64_ABI + unsigned offset = compiler->lvaOutgoingArgSpaceSize; + + genFuncletInfo.fiPSP_slot_InitialSP_offset = offset; + + // How much stack do we allocate in the funclet? + // We need to 16-byte align the stack. + + unsigned totalFrameSize = + REGSIZE_BYTES // return address + + REGSIZE_BYTES // pushed EBP + + (compiler->compCalleeRegsPushed * REGSIZE_BYTES); // pushed callee-saved int regs, not including EBP + + // Entire 128-bits of XMM register is saved to stack due to ABI encoding requirement. + // Copying entire XMM register to/from memory will be performant if SP is aligned at XMM_REGSIZE_BYTES boundary. + unsigned calleeFPRegsSavedSize = genCountBits(compiler->compCalleeFPRegsSavedMask) * XMM_REGSIZE_BYTES; + unsigned FPRegsPad = (calleeFPRegsSavedSize > 0) ? AlignmentPad(totalFrameSize, XMM_REGSIZE_BYTES) : 0; + + totalFrameSize += FPRegsPad // Padding before pushing entire xmm regs + + calleeFPRegsSavedSize // pushed callee-saved float regs + // below calculated 'pad' will go here + + REGSIZE_BYTES // PSPSym + + compiler->lvaOutgoingArgSpaceSize // outgoing arg space + ; + + unsigned pad = AlignmentPad(totalFrameSize, 16); + + genFuncletInfo.fiSpDelta = FPRegsPad // Padding to align SP on XMM_REGSIZE_BYTES boundary + + calleeFPRegsSavedSize // Callee saved xmm regs + + pad + REGSIZE_BYTES // PSPSym + + compiler->lvaOutgoingArgSpaceSize // outgoing arg space + ; + +#ifdef DEBUG + if (verbose) + { + printf("\n"); + printf("Funclet prolog / epilog info\n"); + printf(" Function InitialSP-to-FP delta: %d\n", genFuncletInfo.fiFunction_InitialSP_to_FP_delta); + printf(" SP delta: %d\n", genFuncletInfo.fiSpDelta); + printf(" PSP slot Initial SP offset: %d\n", genFuncletInfo.fiPSP_slot_InitialSP_offset); + } +#endif // DEBUG + + assert(compiler->lvaPSPSym != BAD_VAR_NUM); + assert(genFuncletInfo.fiPSP_slot_InitialSP_offset == + compiler->lvaGetInitialSPRelativeOffset(compiler->lvaPSPSym)); // same offset used in main function and + // funclet! +} + +#elif defined(_TARGET_ARM64_) + +// Look in CodeGenArm64.cpp + +#else // _TARGET_* + +/***************************************************************************** + * + * Generates code for an EH funclet prolog. + */ + +void CodeGen::genFuncletProlog(BasicBlock* block) +{ + NYI("Funclet prolog"); +} + +/***************************************************************************** + * + * Generates code for an EH funclet epilog. + */ + +void CodeGen::genFuncletEpilog() +{ + NYI("Funclet epilog"); +} + +/***************************************************************************** + * + * Capture the information used to generate the funclet prologs and epilogs. + */ + +void CodeGen::genCaptureFuncletPrologEpilogInfo() +{ + if (compiler->ehAnyFunclets()) + { + NYI("genCaptureFuncletPrologEpilogInfo()"); + } +} + +#endif // _TARGET_* + +/*----------------------------------------------------------------------------- + * + * Set the main function PSPSym value in the frame. + * Funclets use different code to load the PSP sym and save it in their frame. + * See the document "X64 and ARM ABIs.docx" for a full description of the PSPSym. + * The PSPSym section of that document is copied here. + * + *********************************** + * The name PSPSym stands for Previous Stack Pointer Symbol. 
It is how a funclet + * accesses locals from the main function body. + * + * First, two definitions. + * + * Caller-SP is the value of the stack pointer in a function's caller before the call + * instruction is executed. That is, when function A calls function B, Caller-SP for B + * is the value of the stack pointer immediately before the call instruction in A + * (calling B) was executed. Note that this definition holds for both AMD64, which + * pushes the return value when a call instruction is executed, and for ARM, which + * doesn't. For AMD64, Caller-SP is the address above the call return address. + * + * Initial-SP is the initial value of the stack pointer after the fixed-size portion of + * the frame has been allocated. That is, before any "alloca"-type allocations. + * + * The PSPSym is a pointer-sized local variable in the frame of the main function and + * of each funclet. The value stored in PSPSym is the value of Initial-SP/Caller-SP + * for the main function. The stack offset of the PSPSym is reported to the VM in the + * GC information header. The value reported in the GC information is the offset of the + * PSPSym from Initial-SP/Caller-SP. (Note that both the value stored, and the way the + * value is reported to the VM, differs between architectures. In particular, note that + * most things in the GC information header are reported as offsets relative to Caller-SP, + * but PSPSym on AMD64 is one (maybe the only) exception.) + * + * The VM uses the PSPSym to find other locals it cares about (such as the generics context + * in a funclet frame). The JIT uses it to re-establish the frame pointer register, so that + * the frame pointer is the same value in a funclet as it is in the main function body. + * + * When a funclet is called, it is passed the Establisher Frame Pointer. For AMD64 this is + * true for all funclets and it is passed as the first argument in RCX, but for ARM this is + * only true for first pass funclets (currently just filters) and it is passed as the second + * argument in R1. The Establisher Frame Pointer is a stack pointer of an interesting "parent" + * frame in the exception processing system. For the CLR, it points either to the main function + * frame or a dynamically enclosing funclet frame from the same function, for the funclet being + * invoked. The value of the Establisher Frame Pointer is Initial-SP on AMD64, Caller-SP on ARM. + * + * Using the establisher frame, the funclet wants to load the value of the PSPSym. Since we + * don't know if the Establisher Frame is from the main function or a funclet, we design the + * main function and funclet frame layouts to place the PSPSym at an identical, small, constant + * offset from the Establisher Frame in each case. (This is also required because we only report + * a single offset to the PSPSym in the GC information, and that offset must be valid for the main + * function and all of its funclets). Then, the funclet uses this known offset to compute the + * PSPSym address and read its value. From this, it can compute the value of the frame pointer + * (which is a constant offset from the PSPSym value) and set the frame register to be the same + * as the parent function. Also, the funclet writes the value of the PSPSym to its own frame's + * PSPSym. This "copying" of the PSPSym happens for every funclet invocation, in particular, + * for every nested funclet invocation. 
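+ *
+ *  As a concrete sketch (the 0x20 offset is hypothetical), an AMD64 funclet prolog does, in effect:
+ *
+ *      mov rbp, [rcx + 0x20]     ; rcx = Establisher Frame; read the main function's Initial-SP
+ *                                ; from the parent frame's PSPSym
+ *      mov [rsp + 0x20], rbp     ; copy it into this funclet's own PSPSym
+ *      lea rbp, [rbp + delta]    ; re-derive the frame pointer from it (delta is a compile-time constant)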
+ * + * On ARM, for all second pass funclets (finally, fault, catch, and filter-handler) the VM + * restores all non-volatile registers to their values within the parent frame. This includes + * the frame register (R11). Thus, the PSPSym is not used to recompute the frame pointer register + * in this case, though the PSPSym is copied to the funclet's frame, as for all funclets. + * + * Catch, Filter, and Filter-handlers also get an Exception object (GC ref) as an argument + * (REG_EXCEPTION_OBJECT). On AMD64 it is the second argument and thus passed in RDX. On + * ARM this is the first argument and passed in R0. + * + * (Note that the JIT64 source code contains a comment that says, "The current CLR doesn't always + * pass the correct establisher frame to the funclet. Funclet may receive establisher frame of + * funclet when expecting that of original routine." It indicates this is the reason that a PSPSym + * is required in all funclets as well as the main function, whereas if the establisher frame was + * correctly reported, the PSPSym could be omitted in some cases.) + *********************************** + */ +void CodeGen::genSetPSPSym(regNumber initReg, bool* pInitRegZeroed) +{ + assert(compiler->compGeneratingProlog); + + if (!compiler->ehNeedsPSPSym()) + { + return; + } + + noway_assert(isFramePointerUsed()); // We need an explicit frame pointer + assert(compiler->lvaPSPSym != BAD_VAR_NUM); // We should have created the PSPSym variable + +#if defined(_TARGET_ARM_) + + // We either generate: + // add r1, r11, 8 + // str r1, [reg + PSPSymOffset] + // or: + // add r1, sp, 76 + // str r1, [reg + PSPSymOffset] + // depending on the smallest encoding + + int SPtoCallerSPdelta = -genCallerSPtoInitialSPdelta(); + + int callerSPOffs; + regNumber regBase; + + if (arm_Valid_Imm_For_Add_SP(SPtoCallerSPdelta)) + { + // use the "add <reg>, sp, imm" form + + callerSPOffs = SPtoCallerSPdelta; + regBase = REG_SPBASE; + } + else + { + // use the "add <reg>, r11, imm" form + + int FPtoCallerSPdelta = -genCallerSPtoFPdelta(); + noway_assert(arm_Valid_Imm_For_Add(FPtoCallerSPdelta, INS_FLAGS_DONT_CARE)); + + callerSPOffs = FPtoCallerSPdelta; + regBase = REG_FPBASE; + } + + // We will just use the initReg since it is an available register + // and we are probably done using it anyway... + regNumber regTmp = initReg; + *pInitRegZeroed = false; + + getEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, regTmp, regBase, callerSPOffs); + getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, regTmp, compiler->lvaPSPSym, 0); + +#elif defined(_TARGET_ARM64_) + + int SPtoCallerSPdelta = -genCallerSPtoInitialSPdelta(); + + // We will just use the initReg since it is an available register + // and we are probably done using it anyway... + regNumber regTmp = initReg; + *pInitRegZeroed = false; + + getEmitter()->emitIns_R_R_Imm(INS_add, EA_PTRSIZE, regTmp, REG_SPBASE, SPtoCallerSPdelta); + getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, regTmp, compiler->lvaPSPSym, 0); + +#elif defined(_TARGET_AMD64_) + + // The PSP sym value is Initial-SP, not Caller-SP! + // We assume that RSP is Initial-SP when this function is called. That is, the stack frame + // has been established. 
+ // + // We generate: + // mov [rbp-20h], rsp // store the Initial-SP (our current rsp) in the PSPsym + + getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaPSPSym, 0); + +#else // _TARGET_* + + NYI("Set function PSP sym"); + +#endif // _TARGET_* +} + +#endif // FEATURE_EH_FUNCLETS + +/***************************************************************************** + * + * Generates code for all the function and funclet prologs and epilogs. + */ + +void CodeGen::genGeneratePrologsAndEpilogs() +{ +#ifdef DEBUG + if (verbose) + { + printf("*************** Before prolog / epilog generation\n"); + getEmitter()->emitDispIGlist(false); + } +#endif + +#ifndef LEGACY_BACKEND + // Before generating the prolog, we need to reset the variable locations to what they will be on entry. + // This affects our code that determines which untracked locals need to be zero initialized. + compiler->m_pLinearScan->recordVarLocationsAtStartOfBB(compiler->fgFirstBB); +#endif // !LEGACY_BACKEND + + // Tell the emitter we're done with main code generation, and are going to start prolog and epilog generation. + + getEmitter()->emitStartPrologEpilogGeneration(); + + gcInfo.gcResetForBB(); + genFnProlog(); + + // Generate all the prologs and epilogs. + CLANG_FORMAT_COMMENT_ANCHOR; + +#if FEATURE_EH_FUNCLETS + + // Capture the data we're going to use in the funclet prolog and epilog generation. This is + // information computed during codegen, or during function prolog generation, like + // frame offsets. It must run after main function prolog generation. + + genCaptureFuncletPrologEpilogInfo(); + +#endif // FEATURE_EH_FUNCLETS + + // Walk the list of prologs and epilogs and generate them. + // We maintain a list of prolog and epilog basic blocks in + // the insGroup structure in the emitter. This list was created + // during code generation by the genReserve*() functions. + // + // TODO: it seems like better design would be to create a list of prologs/epilogs + // in the code generator (not the emitter), and then walk that list. But we already + // have the insGroup list, which serves well, so we don't need the extra allocations + // for a prolog/epilog list in the code generator. + + getEmitter()->emitGeneratePrologEpilog(); + + // Tell the emitter we're done with all prolog and epilog generation. + + getEmitter()->emitFinishPrologEpilogGeneration(); + +#ifdef DEBUG + if (verbose) + { + printf("*************** After prolog / epilog generation\n"); + getEmitter()->emitDispIGlist(false); + } +#endif +} + +/* +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XX XX +XX End Prolog / Epilog XX +XX XX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +*/ + +#if STACK_PROBES +void CodeGen::genGenerateStackProbe() +{ + noway_assert(compiler->opts.compNeedStackProbes); + + // If this assert fires, it means somebody has changed the value + // CORINFO_STACKPROBE_DEPTH. + // Why does the EE need such a deep probe? It should just need a couple + // of bytes, to set up a frame in the unmanaged code.. 
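+    //
+    // For reference, the probe emitted below is just a read that touches the deepest stack
+    // address the EE may need (illustrative x86 flavor):
+    //     test [esp - (CORINFO_STACKPROBE_DEPTH + JIT_RESERVED_STACK)], eax
+    // so that any stack overflow fault is raised here, while managed state is still intact,
+    // rather than inside the unmanaged helper.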
+
+    // Note: eeGetPageSize() is not a compile-time constant, so this must be a runtime check.
+    noway_assert(CORINFO_STACKPROBE_DEPTH + JIT_RESERVED_STACK < compiler->eeGetPageSize());
+
+    JITDUMP("Emitting stack probe:\n");
+    getEmitter()->emitIns_AR_R(INS_TEST, EA_PTRSIZE, REG_EAX, REG_SPBASE,
+                               -(CORINFO_STACKPROBE_DEPTH + JIT_RESERVED_STACK));
+}
+#endif // STACK_PROBES
+
+/*****************************************************************************
+ *
+ * Record the constant and return a tree node that yields its address.
+ */
+
+GenTreePtr CodeGen::genMakeConst(const void* cnsAddr, var_types cnsType, GenTreePtr cnsTree, bool dblAlign)
+{
+    // Assign the constant an offset in the data section
+    UNATIVE_OFFSET cnsSize = genTypeSize(cnsType);
+    UNATIVE_OFFSET cnum    = getEmitter()->emitDataConst(cnsAddr, cnsSize, dblAlign);
+
+#ifdef DEBUG
+    if (compiler->opts.dspCode)
+    {
+        printf("   @%s%02u   ", "CNS", cnum);
+
+        switch (cnsType)
+        {
+            case TYP_INT:
+                printf("DD      %d \n", *(int*)cnsAddr);
+                break;
+            case TYP_LONG:
+                printf("DQ      %lld\n", *(__int64*)cnsAddr);
+                break;
+            case TYP_FLOAT:
+                printf("DF      %f \n", *(float*)cnsAddr);
+                break;
+            case TYP_DOUBLE:
+                printf("DQ      %lf\n", *(double*)cnsAddr);
+                break;
+
+            default:
+                noway_assert(!"unexpected constant type");
+        }
+    }
+#endif
+
+    // Access to inline data is 'abstracted' by a special type of static member
+    // (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference
+    // to constant data, not a real static field.
+
+    return new (compiler, GT_CLS_VAR) GenTreeClsVar(cnsType, compiler->eeFindJitDataOffs(cnum), nullptr);
+}
+
+#if defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87
+// Save compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
+// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
+// Here offset = 16-byte aligned offset after pushing integer registers.
+//
+// Params
+//   lclFrameSize - Fixed frame size excluding callee pushed int regs.
+//                  non-funclet: this will be compLclFrameSize.
+//                  funclet frames: this will be FuncletInfo.fiSpDelta.
+void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
+{
+    regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
+
+    // Only callee saved floating point registers should be in regMask
+    assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
+
+    // fast path return
+    if (regMask == RBM_NONE)
+    {
+        return;
+    }
+
+#ifdef _TARGET_AMD64_
+    unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
+    unsigned offset            = lclFrameSize - firstFPRegPadding - XMM_REGSIZE_BYTES;
+
+    // Offset is 16-byte aligned since we use movaps for preserving xmm regs.
+    assert((offset % 16) == 0);
+    instruction copyIns = ins_Copy(TYP_FLOAT);
+#else  // !_TARGET_AMD64_
+    unsigned    offset  = lclFrameSize - XMM_REGSIZE_BYTES;
+    instruction copyIns = INS_movupd;
+#endif // !_TARGET_AMD64_
+
+    for (regNumber reg = REG_FLT_CALLEE_SAVED_FIRST; regMask != RBM_NONE; reg = REG_NEXT(reg))
+    {
+        regMaskTP regBit = genRegMask(reg);
+        if ((regBit & regMask) != 0)
+        {
+            // ABI requires us to preserve lower 128-bits of YMM register.
+            getEmitter()->emitIns_AR_R(copyIns,
+                                       EA_8BYTE, // TODO-XArch-Cleanup: size specified here doesn't matter but should be
+                                                 // EA_16BYTE
+                                       reg, REG_SPBASE, offset);
+            compiler->unwindSaveReg(reg, offset);
+            regMask &= ~regBit;
+            offset -= XMM_REGSIZE_BYTES;
+        }
+    }
+
+#ifdef FEATURE_AVX_SUPPORT
+    // After saving the float registers, issue a Vzeroupper to zero out the upper 128-bits of all YMM regs.
+    // This avoids the AVX-SSE transition penalty in case the caller used AVX-256 and the
+    // code that follows uses SSE2.
+    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
+    {
+        instGen(INS_vzeroupper);
+    }
+#endif
+}
+
+// Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
+// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
+// Here offset = 16-byte aligned offset after pushing integer registers.
+//
+// Params
+//   lclFrameSize - Fixed frame size excluding callee pushed int regs.
+//                  non-funclet: this will be compLclFrameSize.
+//                  funclet frames: this will be FuncletInfo.fiSpDelta.
+void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
+{
+    regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
+
+    // Only callee saved floating point registers should be in regMask
+    assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
+
+    // fast path return
+    if (regMask == RBM_NONE)
+    {
+        return;
+    }
+
+#ifdef _TARGET_AMD64_
+    unsigned    firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
+    instruction copyIns           = ins_Copy(TYP_FLOAT);
+#else  // !_TARGET_AMD64_
+    unsigned    firstFPRegPadding = 0;
+    instruction copyIns           = INS_movupd;
+#endif // !_TARGET_AMD64_
+
+    unsigned  offset;
+    regNumber regBase;
+    if (compiler->compLocallocUsed)
+    {
+        // localloc frame: use frame pointer relative offset
+        assert(isFramePointerUsed());
+        regBase = REG_FPBASE;
+        offset  = lclFrameSize - genSPtoFPdelta() - firstFPRegPadding - XMM_REGSIZE_BYTES;
+    }
+    else
+    {
+        regBase = REG_SPBASE;
+        offset  = lclFrameSize - firstFPRegPadding - XMM_REGSIZE_BYTES;
+    }
+
+#ifdef _TARGET_AMD64_
+    // Offset is 16-byte aligned since we use movaps for restoring xmm regs
+    assert((offset % 16) == 0);
+#endif // _TARGET_AMD64_
+
+#ifdef FEATURE_AVX_SUPPORT
+    // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
+    // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
+    // using SSE2.
+    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
+    {
+        instGen(INS_vzeroupper);
+    }
+#endif
+
+    for (regNumber reg = REG_FLT_CALLEE_SAVED_FIRST; regMask != RBM_NONE; reg = REG_NEXT(reg))
+    {
+        regMaskTP regBit = genRegMask(reg);
+        if ((regBit & regMask) != 0)
+        {
+            // ABI requires us to restore lower 128-bits of YMM register.
+            getEmitter()->emitIns_R_AR(copyIns,
+                                       EA_8BYTE, // TODO-XArch-Cleanup: size specified here doesn't matter but should be
+                                                 // EA_16BYTE
+                                       reg, regBase, offset);
+            regMask &= ~regBit;
+            offset -= XMM_REGSIZE_BYTES;
+        }
+    }
+}
+#endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87
+
+//-----------------------------------------------------------------------------------
+// IsMultiRegPassedType: Returns true if the type is passed in multiple registers
+//
+// Arguments:
+//     hClass   - type handle
+//
+// Return Value:
+//     true if type is passed in multiple registers, false otherwise.
+//
+bool Compiler::IsMultiRegPassedType(CORINFO_CLASS_HANDLE hClass)
+{
+    if (hClass == NO_CLASS_HANDLE)
+    {
+        return false;
+    }
+
+    structPassingKind howToPassStruct;
+    var_types         returnType = getArgTypeForStruct(hClass, &howToPassStruct);
+
+    return (returnType == TYP_STRUCT);
+}
+
+//-----------------------------------------------------------------------------------
+// IsMultiRegReturnedType: Returns true if the type is returned in multiple registers
+//
+// Arguments:
+//     hClass   - type handle
+//
+// Return Value:
+//     true if type is returned in multiple registers, false otherwise.
+//
+bool Compiler::IsMultiRegReturnedType(CORINFO_CLASS_HANDLE hClass)
+{
+    if (hClass == NO_CLASS_HANDLE)
+    {
+        return false;
+    }
+
+    structPassingKind howToReturnStruct;
+    var_types         returnType = getReturnTypeForStruct(hClass, &howToReturnStruct);
+
+    return (returnType == TYP_STRUCT);
+}
+
+//----------------------------------------------
+// Methods that support HFA's for ARM32/ARM64
+//----------------------------------------------
+
+bool Compiler::IsHfa(CORINFO_CLASS_HANDLE hClass)
+{
+#ifdef FEATURE_HFA
+    return varTypeIsFloating(GetHfaType(hClass));
+#else
+    return false;
+#endif
+}
+
+bool Compiler::IsHfa(GenTreePtr tree)
+{
+#ifdef FEATURE_HFA
+    return IsHfa(gtGetStructHandleIfPresent(tree));
+#else
+    return false;
+#endif
+}
+
+var_types Compiler::GetHfaType(GenTreePtr tree)
+{
+#ifdef FEATURE_HFA
+    if (tree->TypeGet() == TYP_STRUCT)
+    {
+        return GetHfaType(gtGetStructHandleIfPresent(tree));
+    }
+#endif
+    return TYP_UNDEF;
+}
+
+unsigned Compiler::GetHfaCount(GenTreePtr tree)
+{
+    return GetHfaCount(gtGetStructHandleIfPresent(tree));
+}
+
+var_types Compiler::GetHfaType(CORINFO_CLASS_HANDLE hClass)
+{
+    var_types result = TYP_UNDEF;
+    if (hClass != NO_CLASS_HANDLE)
+    {
+#ifdef FEATURE_HFA
+        CorInfoType corType = info.compCompHnd->getHFAType(hClass);
+        if (corType != CORINFO_TYPE_UNDEF)
+        {
+            result = JITtype2varType(corType);
+        }
+#endif // FEATURE_HFA
+    }
+    return result;
+}
+
+//------------------------------------------------------------------------
+// GetHfaCount: Given a class handle for an HFA struct
+//    return the number of registers needed to hold the HFA
+//
+//    Note that on ARM32 the single precision registers overlap with
+//        the double precision registers and for that reason each
+//        double register is considered to be two single registers.
+//        Thus for ARM32 an HFA of 4 doubles this function will return 8.
+//        On ARM64 given an HFA of 4 singles or 4 doubles this function
+//        will return 4 for both.
+// Arguments:
+//    hClass: the class handle of a HFA struct
+//
+unsigned Compiler::GetHfaCount(CORINFO_CLASS_HANDLE hClass)
+{
+    assert(IsHfa(hClass));
+#ifdef _TARGET_ARM_
+    // A HFA of doubles is twice as large as an HFA of singles for ARM32
+    // (i.e. uses twice the number of single precision registers)
+    return info.compCompHnd->getClassSize(hClass) / REGSIZE_BYTES;
+#else  // _TARGET_ARM64_
+    var_types hfaType   = GetHfaType(hClass);
+    unsigned  classSize = info.compCompHnd->getClassSize(hClass);
+    // Note that the retail build issues a warning about a potential division by zero without the Max function
+    unsigned elemSize = Max((unsigned)1, EA_SIZE_IN_BYTES(emitActualTypeSize(hfaType)));
+    return classSize / elemSize;
+#endif // _TARGET_ARM64_
+}
+
+#ifdef _TARGET_XARCH_
+
+//------------------------------------------------------------------------
+// genMapShiftInsToShiftByConstantIns: Given a general shift/rotate instruction,
+// map it to the specific x86/x64 shift opcode for a shift/rotate by a constant.
+// X86/x64 has a special encoding for shift/rotate-by-constant-1.
+//
+// Arguments:
+//    ins: the base shift/rotate instruction
+//    shiftByValue: the constant value by which we are shifting/rotating
+//
+instruction CodeGen::genMapShiftInsToShiftByConstantIns(instruction ins, int shiftByValue)
+{
+    assert(ins == INS_rcl || ins == INS_rcr || ins == INS_rol || ins == INS_ror || ins == INS_shl || ins == INS_shr ||
+           ins == INS_sar);
+
+    // Which format should we use?
+
+    instruction shiftByConstantIns;
+
+    if (shiftByValue == 1)
+    {
+        // Use the shift-by-one format.
+
+        assert(INS_rcl + 1 == INS_rcl_1);
+        assert(INS_rcr + 1 == INS_rcr_1);
+        assert(INS_rol + 1 == INS_rol_1);
+        assert(INS_ror + 1 == INS_ror_1);
+        assert(INS_shl + 1 == INS_shl_1);
+        assert(INS_shr + 1 == INS_shr_1);
+        assert(INS_sar + 1 == INS_sar_1);
+
+        shiftByConstantIns = (instruction)(ins + 1);
+    }
+    else
+    {
+        // Use the shift-by-NNN format.
+
+        assert(INS_rcl + 2 == INS_rcl_N);
+        assert(INS_rcr + 2 == INS_rcr_N);
+        assert(INS_rol + 2 == INS_rol_N);
+        assert(INS_ror + 2 == INS_ror_N);
+        assert(INS_shl + 2 == INS_shl_N);
+        assert(INS_shr + 2 == INS_shr_N);
+        assert(INS_sar + 2 == INS_sar_N);
+
+        shiftByConstantIns = (instruction)(ins + 2);
+    }
+
+    return shiftByConstantIns;
+}
+
+#endif // _TARGET_XARCH_
+
+#if !defined(LEGACY_BACKEND) && (defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_))
+
+//------------------------------------------------------------------------------------------------
+// getFirstArgWithStackSlot - returns the first argument with stack slot on the caller's frame.
+//
+// Return value:
+//    The number of the first argument with stack slot on the caller's frame.
+//
+// Note:
+//    On x64 Windows the caller always creates slots (homing space) in its frame for the
+//    first 4 arguments of a callee (register passed args). So, the variable number
+//    (lclNum) for the first argument with a stack slot is always 0.
+//    For System V systems or arm64, there is no such calling convention requirement, and the code needs to find
+//    the first stack passed argument from the caller. This is done by iterating over
+//    all the lvParam variables and finding the first with lvArgReg equal to REG_STK.
+//
+unsigned CodeGen::getFirstArgWithStackSlot()
+{
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) || defined(_TARGET_ARM64_)
+    unsigned baseVarNum = 0;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+    if (compiler->lvaFirstStackIncomingArgNum != BAD_VAR_NUM)
+    {
+        baseVarNum = compiler->lvaFirstStackIncomingArgNum;
+    }
+    else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+    {
+        // Iterate over all the local variables in the Lcl var table.
+        // They contain all the implicit arguments - thisPtr, retBuf,
+        // generic context, PInvoke cookie, var arg cookie, non-standard args, etc.
+        LclVarDsc* varDsc = nullptr;
+        for (unsigned i = 0; i < compiler->info.compArgsCount; i++)
+        {
+            varDsc = &(compiler->lvaTable[i]);
+
+            // We are iterating over the arguments only.
+            assert(varDsc->lvIsParam);
+
+            if (varDsc->lvArgReg == REG_STK)
+            {
+                baseVarNum = i;
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+                compiler->lvaFirstStackIncomingArgNum = baseVarNum;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+                break;
+            }
+        }
+        assert(varDsc != nullptr);
+    }
+
+    return baseVarNum;
+#elif defined(_TARGET_AMD64_)
+    return 0;
+#else
+    // Not implemented for x86.
+    NYI_X86("getFirstArgWithStackSlot not yet implemented for x86.");
+    return BAD_VAR_NUM;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING || _TARGET_ARM64_
+}
+
+#endif // !LEGACY_BACKEND && (_TARGET_XARCH_ || _TARGET_ARM64_)
+
+/*****************************************************************************/
+#ifdef DEBUGGING_SUPPORT
+
+/*****************************************************************************
+ *                          genSetScopeInfo
+ *
+ * This function should be called only after the sizes of the emitter blocks
+ * have been finalized.
+ */
+
+void CodeGen::genSetScopeInfo()
+{
+    if (!compiler->opts.compScopeInfo)
+    {
+        return;
+    }
+
+#ifdef DEBUG
+    if (verbose)
+    {
+        printf("*************** In genSetScopeInfo()\n");
+    }
+#endif
+
+    if (compiler->info.compVarScopesCount == 0)
+    {
+        compiler->eeSetLVcount(0);
+        compiler->eeSetLVdone();
+        return;
+    }
+
+    noway_assert(compiler->opts.compScopeInfo && (compiler->info.compVarScopesCount > 0));
+    noway_assert(psiOpenScopeList.scNext == nullptr);
+
+    unsigned i;
+    unsigned scopeCnt = siScopeCnt + psiScopeCnt;
+
+    compiler->eeSetLVcount(scopeCnt);
+
+#ifdef DEBUG
+    genTrnslLocalVarCount = scopeCnt;
+    if (scopeCnt)
+    {
+        genTrnslLocalVarInfo = new (compiler, CMK_DebugOnly) TrnslLocalVarInfo[scopeCnt];
+    }
+#endif
+
+    // Record the scopes found for the parameters over the prolog.
+    // The prolog needs to be treated differently as a variable may not
+    // have the same info in the prolog block as is given by compiler->lvaTable.
+    // eg. A register parameter is actually on the stack, before it is loaded to reg.
+
+    CodeGen::psiScope* scopeP;
+
+    for (i = 0, scopeP = psiScopeList.scNext; i < psiScopeCnt; i++, scopeP = scopeP->scNext)
+    {
+        noway_assert(scopeP != nullptr);
+        noway_assert(scopeP->scStartLoc.Valid());
+        noway_assert(scopeP->scEndLoc.Valid());
+
+        UNATIVE_OFFSET startOffs = scopeP->scStartLoc.CodeOffset(getEmitter());
+        UNATIVE_OFFSET endOffs   = scopeP->scEndLoc.CodeOffset(getEmitter());
+
+        unsigned varNum = scopeP->scSlotNum;
+        noway_assert(startOffs <= endOffs);
+
+        // The range may be 0 if the prolog is empty. For such a case,
+        // report the liveness of arguments to span at least the first
+        // instruction in the method. This will be incorrect (except on
+        // entry to the method) if the very first instruction of the method
+        // is part of a loop. However, this should happen
+        // very rarely, and the incorrectness is worth it to be able to look
+        // at the argument on entry to the method.
+ if (startOffs == endOffs) + { + noway_assert(startOffs == 0); + endOffs++; + } + + Compiler::siVarLoc varLoc; + + if (scopeP->scRegister) + { + varLoc.vlType = Compiler::VLT_REG; + varLoc.vlReg.vlrReg = (regNumber)scopeP->u1.scRegNum; + } + else + { + varLoc.vlType = Compiler::VLT_STK; + varLoc.vlStk.vlsBaseReg = (regNumber)scopeP->u2.scBaseReg; + varLoc.vlStk.vlsOffset = scopeP->u2.scOffset; + } + + genSetScopeInfo(i, startOffs, endOffs - startOffs, varNum, scopeP->scLVnum, true, varLoc); + } + + // Record the scopes for the rest of the method. + // Check that the LocalVarInfo scopes look OK + noway_assert(siOpenScopeList.scNext == nullptr); + + CodeGen::siScope* scopeL; + + for (i = 0, scopeL = siScopeList.scNext; i < siScopeCnt; i++, scopeL = scopeL->scNext) + { + noway_assert(scopeL != nullptr); + noway_assert(scopeL->scStartLoc.Valid()); + noway_assert(scopeL->scEndLoc.Valid()); + + // Find the start and end IP + + UNATIVE_OFFSET startOffs = scopeL->scStartLoc.CodeOffset(getEmitter()); + UNATIVE_OFFSET endOffs = scopeL->scEndLoc.CodeOffset(getEmitter()); + + noway_assert(scopeL->scStartLoc != scopeL->scEndLoc); + + // For stack vars, find the base register, and offset + + regNumber baseReg; + signed offset = compiler->lvaTable[scopeL->scVarNum].lvStkOffs; + + if (!compiler->lvaTable[scopeL->scVarNum].lvFramePointerBased) + { + baseReg = REG_SPBASE; + offset += scopeL->scStackLevel; + } + else + { + baseReg = REG_FPBASE; + } + + // Now fill in the varLoc + + Compiler::siVarLoc varLoc; + + // TODO-Review: This only works for always-enregistered variables. With LSRA, a variable might be in a register + // for part of its lifetime, or in different registers for different parts of its lifetime. + // This should only matter for non-debug code, where we do variable enregistration. + // We should store the ranges of variable enregistration in the scope table. + if (compiler->lvaTable[scopeL->scVarNum].lvIsInReg()) + { + var_types type = genActualType(compiler->lvaTable[scopeL->scVarNum].TypeGet()); + switch (type) + { + case TYP_INT: + case TYP_REF: + case TYP_BYREF: +#ifdef _TARGET_64BIT_ + case TYP_LONG: +#endif // _TARGET_64BIT_ + + varLoc.vlType = Compiler::VLT_REG; + varLoc.vlReg.vlrReg = compiler->lvaTable[scopeL->scVarNum].lvRegNum; + break; + +#ifndef _TARGET_64BIT_ + case TYP_LONG: +#if !CPU_HAS_FP_SUPPORT + case TYP_DOUBLE: +#endif + + if (compiler->lvaTable[scopeL->scVarNum].lvOtherReg != REG_STK) + { + varLoc.vlType = Compiler::VLT_REG_REG; + varLoc.vlRegReg.vlrrReg1 = compiler->lvaTable[scopeL->scVarNum].lvRegNum; + varLoc.vlRegReg.vlrrReg2 = compiler->lvaTable[scopeL->scVarNum].lvOtherReg; + } + else + { + varLoc.vlType = Compiler::VLT_REG_STK; + varLoc.vlRegStk.vlrsReg = compiler->lvaTable[scopeL->scVarNum].lvRegNum; + varLoc.vlRegStk.vlrsStk.vlrssBaseReg = baseReg; + if (!isFramePointerUsed() && varLoc.vlRegStk.vlrsStk.vlrssBaseReg == REG_SPBASE) + { + varLoc.vlRegStk.vlrsStk.vlrssBaseReg = (regNumber)ICorDebugInfo::REGNUM_AMBIENT_SP; + } + varLoc.vlRegStk.vlrsStk.vlrssOffset = offset + sizeof(int); + } + break; +#endif // !_TARGET_64BIT_ + +#ifdef _TARGET_64BIT_ + + case TYP_FLOAT: + case TYP_DOUBLE: + // TODO-AMD64-Bug: ndp\clr\src\inc\corinfo.h has a definition of RegNum that only goes up to R15, + // so no XMM registers can get debug information. 
+                    varLoc.vlType       = Compiler::VLT_REG_FP;
+                    varLoc.vlReg.vlrReg = compiler->lvaTable[scopeL->scVarNum].lvRegNum;
+                    break;
+
+#else // !_TARGET_64BIT_
+
+#if CPU_HAS_FP_SUPPORT
+                case TYP_FLOAT:
+                case TYP_DOUBLE:
+                    if (isFloatRegType(type))
+                    {
+                        varLoc.vlType         = Compiler::VLT_FPSTK;
+                        varLoc.vlFPstk.vlfReg = compiler->lvaTable[scopeL->scVarNum].lvRegNum;
+                    }
+                    break;
+#endif // CPU_HAS_FP_SUPPORT
+
+#endif // !_TARGET_64BIT_
+
+#ifdef FEATURE_SIMD
+                case TYP_SIMD8:
+                case TYP_SIMD12:
+                case TYP_SIMD16:
+                case TYP_SIMD32:
+                    varLoc.vlType = Compiler::VLT_REG_FP;
+
+                    // TODO-AMD64-Bug: ndp\clr\src\inc\corinfo.h has a definition of RegNum that only goes up to R15,
+                    // so no XMM registers can get debug information.
+                    //
+                    // Note: we need to initialize the vlrReg field, otherwise the jit dump hits an assert
+                    // in eeDispVar() --> getRegName() that checks that the regNumber is valid.
+                    varLoc.vlReg.vlrReg = compiler->lvaTable[scopeL->scVarNum].lvRegNum;
+                    break;
+#endif // FEATURE_SIMD
+
+                default:
+                    noway_assert(!"Invalid type");
+            }
+        }
+        else
+        {
+            assert(offset != BAD_STK_OFFS);
+            LclVarDsc* varDsc = compiler->lvaTable + scopeL->scVarNum;
+            switch (genActualType(varDsc->TypeGet()))
+            {
+                case TYP_INT:
+                case TYP_REF:
+                case TYP_BYREF:
+                case TYP_FLOAT:
+                case TYP_STRUCT:
+                case TYP_BLK: // Needed because of the TYP_BLK stress mode
+#ifdef FEATURE_SIMD
+                case TYP_SIMD8:
+                case TYP_SIMD12:
+                case TYP_SIMD16:
+                case TYP_SIMD32:
+#endif
+#ifdef _TARGET_64BIT_
+                case TYP_LONG:
+                case TYP_DOUBLE:
+#endif // _TARGET_64BIT_
+#if defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_)
+                    // In the AMD64 ABI we are supposed to pass a struct by reference when its
+                    // size is not 1, 2, 4 or 8 bytes. During fgMorph, the compiler modifies
+                    // the IR to comply with the ABI and therefore changes the type of the lclVar
+                    // that holds the struct from TYP_STRUCT to TYP_BYREF, but it gives us a hint that
+                    // this is still a struct by setting the lvIsTemp flag.
+                    // The same is true for ARM64 and structs > 16 bytes.
+                    // (See Compiler::fgMarkImplicitByRefArgs in Morph.cpp for further detail.)
+                    // Now, the VM expects a special enum for these types of local vars: VLT_STK_BYREF,
+                    // to accommodate this situation.
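+                    // For example (hypothetical): a 24-byte struct argument on AMD64 is rewritten by
+                    // fgMorph into a TYP_BYREF pointing at the caller-allocated copy, with lvIsTemp
+                    // set; reporting VLT_STK_BYREF tells the VM to dereference the stack slot to
+                    // find the actual struct.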
+                    if (varDsc->lvType == TYP_BYREF && varDsc->lvIsTemp)
+                    {
+                        assert(varDsc->lvIsParam);
+                        varLoc.vlType = Compiler::VLT_STK_BYREF;
+                    }
+                    else
+#endif // defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_)
+                    {
+                        varLoc.vlType = Compiler::VLT_STK;
+                    }
+                    varLoc.vlStk.vlsBaseReg = baseReg;
+                    varLoc.vlStk.vlsOffset  = offset;
+                    if (!isFramePointerUsed() && varLoc.vlStk.vlsBaseReg == REG_SPBASE)
+                    {
+                        varLoc.vlStk.vlsBaseReg = (regNumber)ICorDebugInfo::REGNUM_AMBIENT_SP;
+                    }
+                    break;
+
+#ifndef _TARGET_64BIT_
+                case TYP_LONG:
+                case TYP_DOUBLE:
+                    varLoc.vlType             = Compiler::VLT_STK2;
+                    varLoc.vlStk2.vls2BaseReg = baseReg;
+                    varLoc.vlStk2.vls2Offset  = offset;
+                    if (!isFramePointerUsed() && varLoc.vlStk2.vls2BaseReg == REG_SPBASE)
+                    {
+                        varLoc.vlStk2.vls2BaseReg = (regNumber)ICorDebugInfo::REGNUM_AMBIENT_SP;
+                    }
+                    break;
+#endif // !_TARGET_64BIT_
+
+                default:
+                    noway_assert(!"Invalid type");
+            }
+        }
+
+        genSetScopeInfo(psiScopeCnt + i, startOffs, endOffs - startOffs, scopeL->scVarNum, scopeL->scLVnum,
+                        scopeL->scAvailable, varLoc);
+    }
+
+    compiler->eeSetLVdone();
+}
+
+/*****************************************************************************/
+#ifdef LATE_DISASM
+#if defined(DEBUG)
+/*****************************************************************************
+ *                          CompilerRegName
+ *
+ * Can be called only after lviSetLocalVarInfo() has been called
+ */
+
+/* virtual */
+const char* CodeGen::siRegVarName(size_t offs, size_t size, unsigned reg)
+{
+    if (!compiler->opts.compScopeInfo)
+        return nullptr;
+
+    if (compiler->info.compVarScopesCount == 0)
+        return nullptr;
+
+    noway_assert(genTrnslLocalVarCount == 0 || genTrnslLocalVarInfo);
+
+    for (unsigned i = 0; i < genTrnslLocalVarCount; i++)
+    {
+        if ((genTrnslLocalVarInfo[i].tlviVarLoc.vlIsInReg((regNumber)reg)) &&
+            (genTrnslLocalVarInfo[i].tlviAvailable == true) && (genTrnslLocalVarInfo[i].tlviStartPC <= offs + size) &&
+            (genTrnslLocalVarInfo[i].tlviStartPC + genTrnslLocalVarInfo[i].tlviLength > offs))
+        {
+            return genTrnslLocalVarInfo[i].tlviName ? compiler->VarNameToStr(genTrnslLocalVarInfo[i].tlviName) : NULL;
+        }
+    }
+
+    return NULL;
+}
+
+/*****************************************************************************
+ *                          CompilerStkName
+ *
+ * Can be called only after lviSetLocalVarInfo() has been called
+ */
+
+/* virtual */
+const char* CodeGen::siStackVarName(size_t offs, size_t size, unsigned reg, unsigned stkOffs)
+{
+    if (!compiler->opts.compScopeInfo)
+        return nullptr;
+
+    if (compiler->info.compVarScopesCount == 0)
+        return nullptr;
+
+    noway_assert(genTrnslLocalVarCount == 0 || genTrnslLocalVarInfo);
+
+    for (unsigned i = 0; i < genTrnslLocalVarCount; i++)
+    {
+        if ((genTrnslLocalVarInfo[i].tlviVarLoc.vlIsOnStk((regNumber)reg, stkOffs)) &&
+            (genTrnslLocalVarInfo[i].tlviAvailable == true) && (genTrnslLocalVarInfo[i].tlviStartPC <= offs + size) &&
+            (genTrnslLocalVarInfo[i].tlviStartPC + genTrnslLocalVarInfo[i].tlviLength > offs))
+        {
+            return genTrnslLocalVarInfo[i].tlviName ? compiler->VarNameToStr(genTrnslLocalVarInfo[i].tlviName) : NULL;
+        }
+    }
+
+    return NULL;
+}
+
+/*****************************************************************************/
+#endif // defined(DEBUG)
+#endif // LATE_DISASM
+
+#ifdef DEBUG
+
+/*****************************************************************************
+ *                          Display an IPmappingDsc. Pass -1 as mappingNum to not display a mapping number.
+ */ + +void CodeGen::genIPmappingDisp(unsigned mappingNum, Compiler::IPmappingDsc* ipMapping) +{ + if (mappingNum != unsigned(-1)) + { + printf("%d: ", mappingNum); + } + + IL_OFFSETX offsx = ipMapping->ipmdILoffsx; + + if (offsx == BAD_IL_OFFSET) + { + printf("???"); + } + else + { + Compiler::eeDispILOffs(jitGetILoffsAny(offsx)); + + if (jitIsStackEmpty(offsx)) + { + printf(" STACK_EMPTY"); + } + + if (jitIsCallInstruction(offsx)) + { + printf(" CALL_INSTRUCTION"); + } + } + + printf(" "); + ipMapping->ipmdNativeLoc.Print(); + // We can only call this after code generation. Is there any way to tell when it's legal to call? + // printf(" [%x]", ipMapping->ipmdNativeLoc.CodeOffset(getEmitter())); + + if (ipMapping->ipmdIsLabel) + { + printf(" label"); + } + + printf("\n"); +} + +void CodeGen::genIPmappingListDisp() +{ + unsigned mappingNum = 0; + Compiler::IPmappingDsc* ipMapping; + + for (ipMapping = compiler->genIPmappingList; ipMapping != nullptr; ipMapping = ipMapping->ipmdNext) + { + genIPmappingDisp(mappingNum, ipMapping); + ++mappingNum; + } +} + +#endif // DEBUG + +/***************************************************************************** + * + * Append an IPmappingDsc struct to the list that we're maintaining + * for the debugger. + * Record the instr offset as being at the current code gen position. + */ + +void CodeGen::genIPmappingAdd(IL_OFFSETX offsx, bool isLabel) +{ + if (!compiler->opts.compDbgInfo) + { + return; + } + + assert(offsx != BAD_IL_OFFSET); + + switch ((int)offsx) // Need the cast since offs is unsigned and the case statements are comparing to signed. + { + case ICorDebugInfo::PROLOG: + case ICorDebugInfo::EPILOG: + break; + + default: + + if (offsx != ICorDebugInfo::NO_MAPPING) + { + noway_assert(jitGetILoffs(offsx) <= compiler->info.compILCodeSize); + } + + // Ignore this one if it's the same IL offset as the last one we saw. + // Note that we'll let through two identical IL offsets if the flag bits + // differ, or two identical "special" mappings (e.g., PROLOG). + if ((compiler->genIPmappingLast != nullptr) && (offsx == compiler->genIPmappingLast->ipmdILoffsx)) + { + JITDUMP("genIPmappingAdd: ignoring duplicate IL offset 0x%x\n", offsx); + return; + } + break; + } + + /* Create a mapping entry and append it to the list */ + + Compiler::IPmappingDsc* addMapping = + (Compiler::IPmappingDsc*)compiler->compGetMem(sizeof(*addMapping), CMK_DebugInfo); + + addMapping->ipmdNativeLoc.CaptureLocation(getEmitter()); + addMapping->ipmdILoffsx = offsx; + addMapping->ipmdIsLabel = isLabel; + addMapping->ipmdNext = nullptr; + + if (compiler->genIPmappingList != nullptr) + { + assert(compiler->genIPmappingLast != nullptr); + assert(compiler->genIPmappingLast->ipmdNext == nullptr); + compiler->genIPmappingLast->ipmdNext = addMapping; + } + else + { + assert(compiler->genIPmappingLast == nullptr); + compiler->genIPmappingList = addMapping; + } + + compiler->genIPmappingLast = addMapping; + +#ifdef DEBUG + if (verbose) + { + printf("Added IP mapping: "); + genIPmappingDisp(unsigned(-1), addMapping); + } +#endif // DEBUG +} + +/***************************************************************************** + * + * Prepend an IPmappingDsc struct to the list that we're maintaining + * for the debugger. + * Record the instr offset as being at the current code gen position. 
+ */ +void CodeGen::genIPmappingAddToFront(IL_OFFSETX offsx) +{ + if (!compiler->opts.compDbgInfo) + { + return; + } + + assert(offsx != BAD_IL_OFFSET); + assert(compiler->compGeneratingProlog); // We only ever do this during prolog generation. + + switch ((int)offsx) // Need the cast since offs is unsigned and the case statements are comparing to signed. + { + case ICorDebugInfo::NO_MAPPING: + case ICorDebugInfo::PROLOG: + case ICorDebugInfo::EPILOG: + break; + + default: + noway_assert(jitGetILoffs(offsx) <= compiler->info.compILCodeSize); + break; + } + + /* Create a mapping entry and prepend it to the list */ + + Compiler::IPmappingDsc* addMapping = + (Compiler::IPmappingDsc*)compiler->compGetMem(sizeof(*addMapping), CMK_DebugInfo); + + addMapping->ipmdNativeLoc.CaptureLocation(getEmitter()); + addMapping->ipmdILoffsx = offsx; + addMapping->ipmdIsLabel = true; + addMapping->ipmdNext = nullptr; + + addMapping->ipmdNext = compiler->genIPmappingList; + compiler->genIPmappingList = addMapping; + + if (compiler->genIPmappingLast == nullptr) + { + compiler->genIPmappingLast = addMapping; + } + +#ifdef DEBUG + if (verbose) + { + printf("Added IP mapping to front: "); + genIPmappingDisp(unsigned(-1), addMapping); + } +#endif // DEBUG +} + +/*****************************************************************************/ + +C_ASSERT(IL_OFFSETX(ICorDebugInfo::NO_MAPPING) != IL_OFFSETX(BAD_IL_OFFSET)); +C_ASSERT(IL_OFFSETX(ICorDebugInfo::PROLOG) != IL_OFFSETX(BAD_IL_OFFSET)); +C_ASSERT(IL_OFFSETX(ICorDebugInfo::EPILOG) != IL_OFFSETX(BAD_IL_OFFSET)); + +C_ASSERT(IL_OFFSETX(BAD_IL_OFFSET) > MAX_IL_OFFSET); +C_ASSERT(IL_OFFSETX(ICorDebugInfo::NO_MAPPING) > MAX_IL_OFFSET); +C_ASSERT(IL_OFFSETX(ICorDebugInfo::PROLOG) > MAX_IL_OFFSET); +C_ASSERT(IL_OFFSETX(ICorDebugInfo::EPILOG) > MAX_IL_OFFSET); + +//------------------------------------------------------------------------ +// jitGetILoffs: Returns the IL offset portion of the IL_OFFSETX type. +// Asserts if any ICorDebugInfo distinguished value (like ICorDebugInfo::NO_MAPPING) +// is seen; these are unexpected here. Also asserts if passed BAD_IL_OFFSET. +// +// Arguments: +// offsx - the IL_OFFSETX value with the IL offset to extract. +// +// Return Value: +// The IL offset. + +IL_OFFSET jitGetILoffs(IL_OFFSETX offsx) +{ + assert(offsx != BAD_IL_OFFSET); + + switch ((int)offsx) // Need the cast since offs is unsigned and the case statements are comparing to signed. + { + case ICorDebugInfo::NO_MAPPING: + case ICorDebugInfo::PROLOG: + case ICorDebugInfo::EPILOG: + unreached(); + + default: + return IL_OFFSET(offsx & ~IL_OFFSETX_BITS); + } +} + +//------------------------------------------------------------------------ +// jitGetILoffsAny: Similar to jitGetILoffs(), but passes through ICorDebugInfo +// distinguished values. Asserts if passed BAD_IL_OFFSET. +// +// Arguments: +// offsx - the IL_OFFSETX value with the IL offset to extract. +// +// Return Value: +// The IL offset. + +IL_OFFSET jitGetILoffsAny(IL_OFFSETX offsx) +{ + assert(offsx != BAD_IL_OFFSET); + + switch ((int)offsx) // Need the cast since offs is unsigned and the case statements are comparing to signed. + { + case ICorDebugInfo::NO_MAPPING: + case ICorDebugInfo::PROLOG: + case ICorDebugInfo::EPILOG: + return IL_OFFSET(offsx); + + default: + return IL_OFFSET(offsx & ~IL_OFFSETX_BITS); + } +} + +//------------------------------------------------------------------------ +// jitIsStackEmpty: Does the IL offset have the stack empty bit set? +// Asserts if passed BAD_IL_OFFSET. 
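+// (Note: there is no literal "stack empty" bit; IL_OFFSETX_STKBIT marks offsets at which the stack
+// is NOT empty, so this returns 'true' when that bit is clear, as well as for the special
+// NO_MAPPING/PROLOG/EPILOG values.)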
+//
+// Arguments:
+//    offsx - the IL_OFFSETX value to check
+//
+// Return Value:
+//    'true' if the offset is marked stack-empty, i.e., the underlying flag
+//    bit (which records a NON-empty stack) is clear; 'false' otherwise.
+
+bool jitIsStackEmpty(IL_OFFSETX offsx)
+{
+    assert(offsx != BAD_IL_OFFSET);
+
+    switch ((int)offsx) // Need the cast since offsx is unsigned and the case labels are signed.
+    {
+        case ICorDebugInfo::NO_MAPPING:
+        case ICorDebugInfo::PROLOG:
+        case ICorDebugInfo::EPILOG:
+            return true;
+
+        default:
+            return (offsx & IL_OFFSETX_STKBIT) == 0;
+    }
+}
+
+//------------------------------------------------------------------------
+// jitIsCallInstruction: Does the IL offset have the call instruction bit set?
+// Asserts if passed BAD_IL_OFFSET.
+//
+// Arguments:
+//    offsx - the IL_OFFSETX value to check
+//
+// Return Value:
+//    'true' if the call instruction bit is set; 'false' otherwise.
+
+bool jitIsCallInstruction(IL_OFFSETX offsx)
+{
+    assert(offsx != BAD_IL_OFFSET);
+
+    switch ((int)offsx) // Need the cast since offsx is unsigned and the case labels are signed.
+    {
+        case ICorDebugInfo::NO_MAPPING:
+        case ICorDebugInfo::PROLOG:
+        case ICorDebugInfo::EPILOG:
+            return false;
+
+        default:
+            return (offsx & IL_OFFSETX_CALLINSTRUCTIONBIT) != 0;
+    }
+}
+
+/*****************************************************************************
+ *
+ * When generating debug code, make sure the last reported IL offset maps to
+ * at least one native instruction: if 'offsx' was the last offset reported
+ * and no code has been emitted since, emit a nop so the mapping gets a
+ * distinct native offset.
+ */
+
+void CodeGen::genEnsureCodeEmitted(IL_OFFSETX offsx)
+{
+    if (!compiler->opts.compDbgCode)
+    {
+        return;
+    }
+
+    if (offsx == BAD_IL_OFFSET)
+    {
+        return;
+    }
+
+    /* If 'offsx' is not the last reported IL offset, there's nothing to check */
+
+    if (compiler->genIPmappingLast == nullptr)
+    {
+        return;
+    }
+
+    if (compiler->genIPmappingLast->ipmdILoffsx != offsx)
+    {
+        return;
+    }
+
+    /* offsx was the last reported offset. Make sure that we generated native code */
+
+    if (compiler->genIPmappingLast->ipmdNativeLoc.IsCurrentLocation(getEmitter()))
+    {
+        instGen(INS_nop);
+    }
+}
+
+/*****************************************************************************
+ *
+ * Shut down the IP-mapping logic, report the info to the EE.
+ */
+
+void CodeGen::genIPmappingGen()
+{
+    if (!compiler->opts.compDbgInfo)
+    {
+        return;
+    }
+
+#ifdef DEBUG
+    if (verbose)
+    {
+        printf("*************** In genIPmappingGen()\n");
+    }
+#endif
+
+    if (compiler->genIPmappingList == nullptr)
+    {
+        compiler->eeSetLIcount(0);
+        compiler->eeSetLIdone();
+        return;
+    }
+
+    Compiler::IPmappingDsc* tmpMapping;
+    Compiler::IPmappingDsc* prevMapping;
+    unsigned                mappingCnt;
+    UNATIVE_OFFSET          lastNativeOfs;
+
+    /* First count the number of distinct mapping records */
+
+    mappingCnt    = 0;
+    lastNativeOfs = UNATIVE_OFFSET(~0);
+
+    for (prevMapping = nullptr, tmpMapping = compiler->genIPmappingList; tmpMapping != nullptr;
+         tmpMapping = tmpMapping->ipmdNext)
+    {
+        IL_OFFSETX srcIP = tmpMapping->ipmdILoffsx;
+
+        // Managed RetVal - since new sequence points are emitted to identify IL calls,
+        // make sure that those are not filtered and do not interfere with filtering of
+        // other sequence points.
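+        //
+        // Illustrative walk-through (offsets hypothetical): if successive mappings
+        // captured native offsets 0x10, 0x10, and 0x14, the pair at 0x10 would
+        // collapse into a single record and mappingCnt would end at 2 -- except
+        // that a call-instruction mapping is never collapsed, as checked first: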
+        if (jitIsCallInstruction(srcIP))
+        {
+            mappingCnt++;
+            continue;
+        }
+
+        UNATIVE_OFFSET nextNativeOfs = tmpMapping->ipmdNativeLoc.CodeOffset(getEmitter());
+
+        if (nextNativeOfs != lastNativeOfs)
+        {
+            mappingCnt++;
+            lastNativeOfs = nextNativeOfs;
+            prevMapping   = tmpMapping;
+            continue;
+        }
+
+        /* If there are mappings with the same native offset, then:
+           o If one of them is NO_MAPPING, ignore it
+           o If one of them is a label, report that and ignore the other one
+           o Else report the higher IL offset
+         */
+
+        PREFIX_ASSUME(prevMapping != nullptr); // We would have 'continue'd above if prevMapping were still nullptr
+        if (prevMapping->ipmdILoffsx == (IL_OFFSETX)ICorDebugInfo::NO_MAPPING)
+        {
+            // If the previous entry was NO_MAPPING, ignore it
+            prevMapping->ipmdNativeLoc.Init();
+            prevMapping = tmpMapping;
+        }
+        else if (srcIP == (IL_OFFSETX)ICorDebugInfo::NO_MAPPING)
+        {
+            // If the current entry is NO_MAPPING, ignore it
+            // Leave prevMapping unchanged as tmpMapping is no longer valid
+            tmpMapping->ipmdNativeLoc.Init();
+        }
+        else if (srcIP == (IL_OFFSETX)ICorDebugInfo::EPILOG || srcIP == 0)
+        {
+            // Count this mapping anyway; the reporting loop below special-cases these entries.
+            mappingCnt++;
+            prevMapping = tmpMapping;
+        }
+        else
+        {
+            noway_assert(prevMapping != nullptr);
+            noway_assert(!prevMapping->ipmdNativeLoc.Valid() ||
+                         lastNativeOfs == prevMapping->ipmdNativeLoc.CodeOffset(getEmitter()));
+
+            /* The previous mapping had the same native offset, so one of the
+               two must be discarded. Reinitializing a mapping's ipmdNativeLoc
+               marks it invalid, and the reporting loop below will skip it. */
+
+            if (prevMapping->ipmdIsLabel)
+            {
+                // Leave prevMapping unchanged as tmpMapping is no longer valid
+                tmpMapping->ipmdNativeLoc.Init();
+            }
+            else
+            {
+                prevMapping->ipmdNativeLoc.Init();
+                prevMapping = tmpMapping;
+            }
+        }
+    }
+
+    /* Tell the EE how many mapping records we've got */
+
+    compiler->eeSetLIcount(mappingCnt);
+
+    /* Now report the mappings themselves */
+
+    mappingCnt    = 0;
+    lastNativeOfs = UNATIVE_OFFSET(~0);
+
+    for (tmpMapping = compiler->genIPmappingList; tmpMapping != nullptr; tmpMapping = tmpMapping->ipmdNext)
+    {
+        // Do we have to skip this record?
+        if (!tmpMapping->ipmdNativeLoc.Valid())
+        {
+            continue;
+        }
+
+        UNATIVE_OFFSET nextNativeOfs = tmpMapping->ipmdNativeLoc.CodeOffset(getEmitter());
+        IL_OFFSETX     srcIP         = tmpMapping->ipmdILoffsx;
+
+        if (jitIsCallInstruction(srcIP))
+        {
+            compiler->eeSetLIinfo(mappingCnt++, nextNativeOfs, jitGetILoffs(srcIP), jitIsStackEmpty(srcIP), true);
+        }
+        else if (nextNativeOfs != lastNativeOfs)
+        {
+            compiler->eeSetLIinfo(mappingCnt++, nextNativeOfs, jitGetILoffsAny(srcIP), jitIsStackEmpty(srcIP), false);
+            lastNativeOfs = nextNativeOfs;
+        }
+        else if (srcIP == (IL_OFFSETX)ICorDebugInfo::EPILOG || srcIP == 0)
+        {
+            // For the special case of an IL instruction with no body
+            // followed by the epilog (say ret void immediately preceding
+            // the method end), we put two entries in, so that we'll stop
+            // at the (empty) ret statement if the user tries to put a
+            // breakpoint there, and then have the option of seeing the
+            // epilog or not based on SetUnmappedStopMask for the stepper.
+            compiler->eeSetLIinfo(mappingCnt++, nextNativeOfs, jitGetILoffsAny(srcIP), jitIsStackEmpty(srcIP), false);
+        }
+    }
+
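+    /* Illustration of the epilog special case above (all offsets are
+       hypothetical): a method whose trailing "ret" produces no native code of
+       its own, immediately followed by the epilog, reports two boundaries at
+       the same native offset:
+
+           native 0x001A -> IL 0x000C   (the empty "ret" statement)
+           native 0x001A -> EPILOG
+
+       so a debugger can stop on the "ret" and then use its unmapped-stop mask
+       to decide whether the epilog is also visible to the stepper. */
+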
+#if 0
+    // TODO-Review:
+    // This check is disabled. It is always true that any time this check asserts, the debugger would have a
+    // problem with IL source-level debugging. However, for a C# file, it only matters if things are on
+    // different source lines. As a result, we have all sorts of latent problems with how we emit debug
+    // info, but very few actual ones. Whenever someone wants to tackle that problem in general, turn this
+    // assert back on.
+    if (compiler->opts.compDbgCode)
+    {
+        // Assert that the first instruction of every basic block with more than one incoming edge has a
+        // different sequence point from each incoming block.
+        //
+        // It turns out that the only thing we really have to assert is that the first statement in each basic
+        // block has an IL offset and appears in eeBoundaries.
+        for (BasicBlock* block = compiler->fgFirstBB; block != nullptr; block = block->bbNext)
+        {
+            if ((block->bbRefs > 1) && (block->bbTreeList != nullptr))
+            {
+                noway_assert(block->bbTreeList->gtOper == GT_STMT);
+                bool found = false;
+                if (block->bbTreeList->gtStmt.gtStmtILoffsx != BAD_IL_OFFSET)
+                {
+                    IL_OFFSET ilOffs = jitGetILoffs(block->bbTreeList->gtStmt.gtStmtILoffsx);
+                    for (unsigned i = 0; i < eeBoundariesCount; ++i)
+                    {
+                        if (eeBoundaries[i].ilOffset == ilOffs)
+                        {
+                            found = true;
+                            break;
+                        }
+                    }
+                }
+                noway_assert(found && "A basic block that is a jump target did not start a new sequence point.");
+            }
+        }
+    }
+#endif // 0
+
+    compiler->eeSetLIdone();
+}
+
+#endif // DEBUGGING_SUPPORT
+
+/*============================================================================
+ *
+ * These are empty stubs that let the late disassembler compile when
+ * DEBUGGING_SUPPORT is not enabled, or when the late disassembler is built
+ * into a non-DEBUG build.
+ *
+ *============================================================================
+ */
+
+#if defined(LATE_DISASM)
+#if !defined(DEBUGGING_SUPPORT) || !defined(DEBUG)
+
+/* virtual */
+const char* CodeGen::siRegVarName(size_t offs, size_t size, unsigned reg)
+{
+    return nullptr;
+}
+
+/* virtual */
+const char* CodeGen::siStackVarName(size_t offs, size_t size, unsigned reg, unsigned stkOffs)
+{
+    return nullptr;
+}
+
+/*****************************************************************************/
+#endif // !defined(DEBUGGING_SUPPORT) || !defined(DEBUG)
+#endif // defined(LATE_DISASM)
+/*****************************************************************************/