Diffstat (limited to 'src/jit/lower.cpp')
-rw-r--r-- | src/jit/lower.cpp | 742
1 file changed, 658 insertions(+), 84 deletions(-)
diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index 09eb9146ac..a6e50b304c 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -135,6 +135,15 @@ GenTree* Lowering::LowerNode(GenTree* node) LowerCall(node); break; + case GT_LT: + case GT_LE: + case GT_GT: + case GT_GE: + case GT_EQ: + case GT_NE: + LowerCompare(node); + break; + case GT_JMP: LowerJmpMethod(node); break; @@ -169,13 +178,33 @@ GenTree* Lowering::LowerNode(GenTree* node) // produces a TYP_SIMD16 result node->gtType = TYP_SIMD16; } + +#ifdef _TARGET_XARCH_ + if ((node->AsSIMD()->gtSIMDIntrinsicID == SIMDIntrinsicGetItem) && (node->gtGetOp1()->OperGet() == GT_IND)) + { + // If SIMD vector is already in memory, we force its + // addr to be evaluated into a reg. This would allow + // us to generate [regBase] or [regBase+offset] or + // [regBase+sizeOf(SIMD vector baseType)*regIndex] + // to access the required SIMD vector element directly + // from memory. + // + // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we + // might be able update GT_LEA to fold the regIndex + // or offset in some cases. Instead with this + // approach we always evaluate GT_LEA into a reg. + // Ideally, we should be able to lower GetItem intrinsic + // into GT_IND(newAddr) where newAddr combines + // the addr of SIMD vector with the given index. + node->gtOp.gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; + } +#endif break; case GT_LCL_VAR: case GT_STORE_LCL_VAR: if (node->TypeGet() == TYP_SIMD12) { -#ifdef _TARGET_64BIT_ // Assumption 1: // RyuJit backend depends on the assumption that on 64-Bit targets Vector3 size is rounded off // to TARGET_POINTER_SIZE and hence Vector3 locals on stack can be treated as TYP_SIMD16 for @@ -198,10 +227,29 @@ GenTree* Lowering::LowerNode(GenTree* node) // Vector3 return values are returned two return registers and Caller assembles them into a // single xmm reg. Hence RyuJIT explicitly generates code to clears upper 4-bytes of Vector3 // type args in prolog and Vector3 type return value of a call + // + // RyuJIT x86 Windows: all non-param Vector3 local vars are allocated as 16 bytes. Vector3 arguments + // are pushed as 12 bytes. For return values, a 16-byte local is allocated and the address passed + // as a return buffer pointer. The callee doesn't write the high 4 bytes, and we don't need to clear + // it either. + + unsigned varNum = node->AsLclVarCommon()->GetLclNum(); + LclVarDsc* varDsc = &comp->lvaTable[varNum]; + +#if defined(_TARGET_64BIT_) + assert(varDsc->lvSize() == 16); node->gtType = TYP_SIMD16; -#else - NYI("Lowering of TYP_SIMD12 locals"); -#endif // _TARGET_64BIT_ +#else // !_TARGET_64BIT_ + if (varDsc->lvSize() == 16) + { + node->gtType = TYP_SIMD16; + } + else + { + // The following assert is guaranteed by lvSize(). + assert(varDsc->lvIsParam); + } +#endif // !_TARGET_64BIT_ } #endif // FEATURE_SIMD __fallthrough; @@ -215,7 +263,7 @@ GenTree* Lowering::LowerNode(GenTree* node) #if FEATURE_MULTIREG_RET GenTree* src = node->gtGetOp1(); assert((src->OperGet() == GT_CALL) && src->AsCall()->HasMultiRegRetVal()); -#else // !FEATURE_MULTIREG_RET +#else // !FEATURE_MULTIREG_RET assert(!"Unexpected struct local store in Lowering"); #endif // !FEATURE_MULTIREG_RET } @@ -680,7 +728,7 @@ void Lowering::ReplaceArgWithPutArgOrCopy(GenTree** argSlot, GenTree* putArgOrCo // Arguments: // call - the call whose arg is being rewritten. // arg - the arg being rewritten. -// info - the ArgTabEntry information for the argument. +// info - the fgArgTabEntry information for the argument. 
// type - the type of the argument. // // Return Value: @@ -692,11 +740,11 @@ void Lowering::ReplaceArgWithPutArgOrCopy(GenTree** argSlot, GenTree* putArgOrCo // // Notes: // For System V systems with native struct passing (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) -// this method allocates a single GT_PUTARG_REG for 1 eightbyte structs and a GT_LIST of two GT_PUTARG_REGs +// this method allocates a single GT_PUTARG_REG for 1 eightbyte structs and a GT_FIELD_LIST of two GT_PUTARG_REGs // for two eightbyte structs. // // For STK passed structs the method generates GT_PUTARG_STK tree. For System V systems with native struct passing -// (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) this method also sets the GP pointers count and the pointers +// (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) this method also sets the GC pointers count and the pointers // layout object, so the codegen of the GT_PUTARG_STK could use this for optimizing copying to the stack by value. // (using block copy primitives for non GC pointers and a single TARGET_POINTER_SIZE copy with recording GC info.) // @@ -753,8 +801,8 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP // In this case a new tree is created that is GT_PUTARG_REG // with a op1 the original argument. // 2. The struct is contained in 2 eightbytes: - // in this case the arg comes as a GT_LIST of two GT_LCL_FLDs - the two eightbytes of the struct. - // The code creates a GT_PUTARG_REG node for each GT_LCL_FLD in the GT_LIST + // in this case the arg comes as a GT_FIELD_LIST of two GT_LCL_FLDs - the two eightbytes of the struct. + // The code creates a GT_PUTARG_REG node for each GT_LCL_FLD in the GT_FIELD_LIST // and splices it in the list with the corresponding original GT_LCL_FLD tree as op1. assert(info->structDesc.eightByteCount != 0); @@ -826,25 +874,25 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP // // clang-format on - assert(arg->OperGet() == GT_LIST); + assert(arg->OperGet() == GT_FIELD_LIST); - GenTreeArgList* argListPtr = arg->AsArgList(); - assert(argListPtr->IsAggregate()); + GenTreeFieldList* fieldListPtr = arg->AsFieldList(); + assert(fieldListPtr->IsFieldListHead()); - for (unsigned ctr = 0; argListPtr != nullptr; argListPtr = argListPtr->Rest(), ctr++) + for (unsigned ctr = 0; fieldListPtr != nullptr; fieldListPtr = fieldListPtr->Rest(), ctr++) { // Create a new GT_PUTARG_REG node with op1 the original GT_LCL_FLD. GenTreePtr newOper = comp->gtNewOperNode( GT_PUTARG_REG, comp->GetTypeFromClassificationAndSizes(info->structDesc.eightByteClassifications[ctr], info->structDesc.eightByteSizes[ctr]), - argListPtr->gtOp.gtOp1); + fieldListPtr->gtOp.gtOp1); - // Splice in the new GT_PUTARG_REG node in the GT_LIST - ReplaceArgWithPutArgOrCopy(&argListPtr->gtOp.gtOp1, newOper); + // Splice in the new GT_PUTARG_REG node in the GT_FIELD_LIST + ReplaceArgWithPutArgOrCopy(&fieldListPtr->gtOp.gtOp1, newOper); } - // Just return arg. The GT_LIST is not replaced. + // Just return arg. The GT_FIELD_LIST is not replaced. // Nothing more to do. 
return arg; } @@ -857,26 +905,26 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP else #else // not defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) #if FEATURE_MULTIREG_ARGS - if ((info->numRegs > 1) && (arg->OperGet() == GT_LIST)) + if ((info->numRegs > 1) && (arg->OperGet() == GT_FIELD_LIST)) { - assert(arg->OperGet() == GT_LIST); + assert(arg->OperGet() == GT_FIELD_LIST); - GenTreeArgList* argListPtr = arg->AsArgList(); - assert(argListPtr->IsAggregate()); + GenTreeFieldList* fieldListPtr = arg->AsFieldList(); + assert(fieldListPtr->IsFieldListHead()); - for (unsigned ctr = 0; argListPtr != nullptr; argListPtr = argListPtr->Rest(), ctr++) + for (unsigned ctr = 0; fieldListPtr != nullptr; fieldListPtr = fieldListPtr->Rest(), ctr++) { - GenTreePtr curOp = argListPtr->gtOp.gtOp1; + GenTreePtr curOp = fieldListPtr->gtOp.gtOp1; var_types curTyp = curOp->TypeGet(); // Create a new GT_PUTARG_REG node with op1 GenTreePtr newOper = comp->gtNewOperNode(GT_PUTARG_REG, curTyp, curOp); - // Splice in the new GT_PUTARG_REG node in the GT_LIST - ReplaceArgWithPutArgOrCopy(&argListPtr->gtOp.gtOp1, newOper); + // Splice in the new GT_PUTARG_REG node in the GT_FIELD_LIST + ReplaceArgWithPutArgOrCopy(&fieldListPtr->gtOp.gtOp1, newOper); } - // Just return arg. The GT_LIST is not replaced. + // Just return arg. The GT_FIELD_LIST is not replaced. // Nothing more to do. return arg; } @@ -893,23 +941,20 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP // This provides the info to put this argument in in-coming arg area slot // instead of in out-going arg area slot. - FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(assert(info->isStruct == varTypeIsStruct(type))); // Make sure state is - // correct + PUT_STRUCT_ARG_STK_ONLY(assert(info->isStruct == varTypeIsStruct(type))); // Make sure state is + // correct #if FEATURE_FASTTAILCALL putArg = new (comp, GT_PUTARG_STK) - GenTreePutArgStk(GT_PUTARG_STK, type, arg, - info->slotNum FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(info->numSlots) - FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(info->isStruct), + GenTreePutArgStk(GT_PUTARG_STK, type, arg, info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots), call->IsFastTailCall() DEBUGARG(call)); #else putArg = new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, type, arg, - info->slotNum FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(info->numSlots) - FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(info->isStruct) DEBUGARG(call)); + info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots) DEBUGARG(call)); #endif -#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +#ifdef FEATURE_PUT_STRUCT_ARG_STK // If the ArgTabEntry indicates that this arg is a struct // get and store the number of slots that are references. // This is later used in the codegen for PUT_ARG_STK implementation @@ -919,8 +964,6 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP // pair copying using XMM registers or rep mov instructions. if (info->isStruct) { - unsigned numRefs = 0; - BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots]; // We use GT_OBJ for non-SIMD struct arguments. However, for // SIMD arguments the GT_OBJ has already been transformed. 
if (arg->gtOper != GT_OBJ) @@ -929,13 +972,14 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP } else { + unsigned numRefs = 0; + BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots]; assert(!varTypeIsSIMD(arg)); numRefs = comp->info.compCompHnd->getClassGClayout(arg->gtObj.gtClass, gcLayout); + putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout); } - - putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout); } -#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING +#endif // FEATURE_PUT_STRUCT_ARG_STK } if (arg->InReg()) @@ -1011,6 +1055,22 @@ void Lowering::LowerArg(GenTreeCall* call, GenTreePtr* ppArg) type = TYP_INT; } +#if defined(FEATURE_SIMD) && defined(_TARGET_X86_) + // Non-param TYP_SIMD12 local var nodes are massaged in Lower to TYP_SIMD16 to match their + // allocated size (see lvSize()). However, when passing the variables as arguments, and + // storing the variables to the outgoing argument area on the stack, we must use their + // actual TYP_SIMD12 type, so exactly 12 bytes is allocated and written. + if (type == TYP_SIMD16) + { + if ((arg->OperGet() == GT_LCL_VAR) || (arg->OperGet() == GT_STORE_LCL_VAR)) + { + unsigned varNum = arg->AsLclVarCommon()->GetLclNum(); + LclVarDsc* varDsc = &comp->lvaTable[varNum]; + type = varDsc->lvType; + } + } +#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_) + GenTreePtr putArg; // If we hit this we are probably double-lowering. @@ -1068,7 +1128,7 @@ void Lowering::LowerArg(GenTreeCall* call, GenTreePtr* ppArg) putArg = NewPutArg(call, arg, info, type); // In the case of register passable struct (in one or two registers) - // the NewPutArg returns a new node (GT_PUTARG_REG or a GT_LIST with two GT_PUTARG_REGs.) + // the NewPutArg returns a new node (GT_PUTARG_REG or a GT_FIELD_LIST with two GT_PUTARG_REGs.) // If an extra node is returned, splice it in the right place in the tree. if (arg != putArg) { @@ -1367,6 +1427,7 @@ void Lowering::CheckVSQuirkStackPaddingNeeded(GenTreeCall* call) // Inserts profiler hook, GT_PROF_HOOK for a tail call node. // +// AMD64: // We need to insert this after all nested calls, but before all the arguments to this call have been set up. // To do this, we look for the first GT_PUTARG_STK or GT_PUTARG_REG, and insert the hook immediately before // that. If there are no args, then it should be inserted before the call node. @@ -1391,16 +1452,30 @@ void Lowering::CheckVSQuirkStackPaddingNeeded(GenTreeCall* call) // In this case, the GT_PUTARG_REG src is a nested call. We need to put the instructions after that call // (as shown). We assume that of all the GT_PUTARG_*, only the first one can have a nested call. // +// X86: +// Insert the profiler hook immediately before the call. The profiler hook will preserve +// all argument registers (ECX, EDX), but nothing else. +// // Params: // callNode - tail call node -// insertionPoint - if caller has an insertion point; If null -// profiler hook is inserted before args are setup +// insertionPoint - if non-null, insert the profiler hook before this point. +// If null, insert the profiler hook before args are setup // but after all arg side effects are computed. 
+// void Lowering::InsertProfTailCallHook(GenTreeCall* call, GenTree* insertionPoint) { assert(call->IsTailCall()); assert(comp->compIsProfilerHookNeeded()); +#if defined(_TARGET_X86_) + + if (insertionPoint == nullptr) + { + insertionPoint = call; + } + +#else // !defined(_TARGET_X86_) + if (insertionPoint == nullptr) { GenTreePtr tmp = nullptr; @@ -1437,6 +1512,8 @@ void Lowering::InsertProfTailCallHook(GenTreeCall* call, GenTree* insertionPoint } } +#endif // !defined(_TARGET_X86_) + assert(insertionPoint != nullptr); GenTreePtr profHookNode = new (comp, GT_PROF_HOOK) GenTree(GT_PROF_HOOK, TYP_VOID); BlockRange().InsertBefore(insertionPoint, profHookNode); @@ -1705,7 +1782,10 @@ GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget assert(!comp->opts.compNeedSecurityCheck); // tail call from methods that need security check assert(!call->IsUnmanaged()); // tail calls to unamanaged methods assert(!comp->compLocallocUsed); // tail call from methods that also do localloc - assert(!comp->getNeedsGSSecurityCookie()); // jit64 compat: tail calls from methods that need GS check + +#ifdef _TARGET_AMD64_ + assert(!comp->getNeedsGSSecurityCookie()); // jit64 compat: tail calls from methods that need GS check +#endif // _TARGET_AMD64_ // We expect to see a call that meets the following conditions assert(call->IsTailCallViaHelper()); @@ -1713,8 +1793,9 @@ GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget // The TailCall helper call never returns to the caller and is not GC interruptible. // Therefore the block containing the tail call should be a GC safe point to avoid - // GC starvation. - assert(comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT); + // GC starvation. It is legal for the block to be unmarked iff the entry block is a + // GC safe point, as the entry block trivially dominates every reachable block. + assert((comp->compCurBB->bbFlags & BBF_GC_SAFE_POINT) || (comp->fgFirstBB->bbFlags & BBF_GC_SAFE_POINT)); // If PInvokes are in-lined, we have to remember to execute PInvoke method epilog anywhere that // a method returns. This is a case of caller method has both PInvokes and tail calls. @@ -1839,16 +1920,268 @@ GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget // Now add back tail call flags for identifying this node as tail call dispatched via helper. call->gtCallMoreFlags |= GTF_CALL_M_TAILCALL | GTF_CALL_M_TAILCALL_VIA_HELPER; +#ifdef PROFILING_SUPPORTED // Insert profiler tail call hook if needed. // Since we don't know the insertion point, pass null for second param. if (comp->compIsProfilerHookNeeded()) { InsertProfTailCallHook(call, nullptr); } +#endif // PROFILING_SUPPORTED + + assert(call->IsTailCallViaHelper()); return result; } +//------------------------------------------------------------------------ +// Lowering::LowerCompare: lowers a compare node. +// +// For 64-bit targets, this doesn't do much of anything: all comparisons +// that we support can be handled in code generation on such targets. +// +// For 32-bit targets, however, any comparison that feeds a `GT_JTRUE` +// node must be lowered such that the liveness of the operands to the +// comparison is properly visible to the rest of the backend. 
As such, +// a 64-bit comparison is lowered from something like this: +// +// ------------ BB02 [004..014) -> BB02 (cond), preds={BB02,BB01} succs={BB03,BB02} +// N001 ( 1, 1) [000006] ------------ t6 = lclVar int V02 loc0 u:5 $148 +// +// /--* t6 int +// N002 ( 2, 3) [000007] ---------U-- t7 = * cast long <- ulong <- uint $3c0 +// +// N003 ( 3, 10) [000009] ------------ t9 = lconst long 0x0000000000000003 $101 +// +// /--* t7 long +// +--* t9 long +// N004 ( 9, 17) [000010] N------N-U-- t10 = * < int $149 +// +// /--* t10 int +// N005 ( 11, 19) [000011] ------------ * jmpTrue void +// +// To something like this: +// +// ------------ BB02 [004..014) -> BB03 (cond), preds={BB06,BB07,BB01} succs={BB06,BB03} +// [000099] ------------ t99 = const int 0 +// +// [000101] ------------ t101 = const int 0 +// +// /--* t99 int +// +--* t101 int +// N004 ( 9, 17) [000010] N------N-U-- t10 = * > int $149 +// +// /--* t10 int +// N005 ( 11, 19) [000011] ------------ * jmpTrue void +// +// +// ------------ BB06 [???..???) -> BB02 (cond), preds={BB02} succs={BB07,BB02} +// [000105] -------N-U-- jcc void cond=< +// +// +// ------------ BB07 [???..???) -> BB02 (cond), preds={BB06} succs={BB03,BB02} +// N001 ( 1, 1) [000006] ------------ t6 = lclVar int V02 loc0 u:5 $148 +// +// N003 ( 3, 10) [000009] ------------ t9 = const int 3 +// +// /--* t6 int +// +--* t9 int +// [000106] N------N-U-- t106 = * < int +// +// /--* t106 int +// [000107] ------------ * jmpTrue void +// +// Which will eventually generate code similar to the following: +// +// 33DB xor ebx, ebx +// 85DB test ebx, ebx +// 7707 ja SHORT G_M50523_IG04 +// 72E7 jb SHORT G_M50523_IG03 +// 83F803 cmp eax, 3 +// 72E2 jb SHORT G_M50523_IG03 +// +void Lowering::LowerCompare(GenTree* cmp) +{ +#ifndef _TARGET_64BIT_ + if (cmp->gtGetOp1()->TypeGet() != TYP_LONG) + { + return; + } + + LIR::Use cmpUse; + + if (!BlockRange().TryGetUse(cmp, &cmpUse) || cmpUse.User()->OperGet() != GT_JTRUE) + { + return; + } + + GenTree* src1 = cmp->gtGetOp1(); + GenTree* src2 = cmp->gtGetOp2(); + unsigned weight = m_block->getBBWeight(comp); + + LIR::Use loSrc1(BlockRange(), &(src1->gtOp.gtOp1), src1); + LIR::Use loSrc2(BlockRange(), &(src2->gtOp.gtOp1), src2); + + if (loSrc1.Def()->OperGet() != GT_CNS_INT && loSrc1.Def()->OperGet() != GT_LCL_VAR) + { + loSrc1.ReplaceWithLclVar(comp, weight); + } + + if (loSrc2.Def()->OperGet() != GT_CNS_INT && loSrc2.Def()->OperGet() != GT_LCL_VAR) + { + loSrc2.ReplaceWithLclVar(comp, weight); + } + + BasicBlock* jumpDest = m_block->bbJumpDest; + BasicBlock* nextDest = m_block->bbNext; + BasicBlock* newBlock = comp->fgSplitBlockAtEnd(m_block); + + cmp->gtType = TYP_INT; + cmp->gtOp.gtOp1 = src1->gtOp.gtOp2; + cmp->gtOp.gtOp2 = src2->gtOp.gtOp2; + + if (cmp->OperGet() == GT_EQ || cmp->OperGet() == GT_NE) + { + // 64-bit equality comparisons (no matter the polarity) require two 32-bit comparisons: one for the upper 32 + // bits and one for the lower 32 bits. 
As such, we update the flow graph like so: + // + // Before: + // BB0: cond + // / \ + // false true + // | | + // BB1 BB2 + // + // After: + // BB0: cond(hi) + // / \ + // false true + // | | + // | BB3: cond(lo) + // | / \ + // | false true + // \ / | + // BB1 BB2 + // + + BlockRange().Remove(loSrc1.Def()); + BlockRange().Remove(loSrc2.Def()); + GenTree* loCmp = comp->gtNewOperNode(cmp->OperGet(), TYP_INT, loSrc1.Def(), loSrc2.Def()); + loCmp->gtFlags = cmp->gtFlags; + GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp); + LIR::AsRange(newBlock).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue); + + m_block->bbJumpKind = BBJ_COND; + + if (cmp->OperGet() == GT_EQ) + { + cmp->gtOper = GT_NE; + m_block->bbJumpDest = nextDest; + nextDest->bbFlags |= BBF_JMP_TARGET; + comp->fgAddRefPred(nextDest, m_block); + } + else + { + m_block->bbJumpDest = jumpDest; + comp->fgAddRefPred(jumpDest, m_block); + } + + assert(newBlock->bbJumpKind == BBJ_COND); + assert(newBlock->bbJumpDest == jumpDest); + } + else + { + // 64-bit ordinal comparisons are more complicated: they require two comparisons for the upper 32 bits and one + // comparison for the lower 32 bits. We update the flowgraph as such: + // + // Before: + // BB0: cond + // / \ + // false true + // | | + // BB1 BB2 + // + // After: + // BB0: (!cond(hi) && !eq(hi)) + // / \ + // true false + // | | + // | BB3: (cond(hi) && !eq(hi)) + // | / \ + // | false true + // | | | + // | BB4: cond(lo) | + // | / \ | + // | false true | + // \ / \ / + // BB1 BB2 + // + // + // Note that the actual comparisons used to implement "(!cond(hi) && !eq(hi))" and "(cond(hi) && !eq(hi))" + // differ based on the original condition, and all consist of a single node. The switch statement below + // performs the necessary mapping. + // + + genTreeOps hiCmpOper; + genTreeOps loCmpOper; + + switch (cmp->OperGet()) + { + case GT_LT: + cmp->gtOper = GT_GT; + hiCmpOper = GT_LT; + loCmpOper = GT_LT; + break; + case GT_LE: + cmp->gtOper = GT_GT; + hiCmpOper = GT_LT; + loCmpOper = GT_LE; + break; + case GT_GT: + cmp->gtOper = GT_LT; + hiCmpOper = GT_GT; + loCmpOper = GT_GT; + break; + case GT_GE: + cmp->gtOper = GT_LT; + hiCmpOper = GT_GT; + loCmpOper = GT_GE; + break; + default: + unreached(); + } + + BasicBlock* newBlock2 = comp->fgSplitBlockAtEnd(newBlock); + + GenTree* hiJcc = new (comp, GT_JCC) GenTreeJumpCC(hiCmpOper); + hiJcc->gtFlags = cmp->gtFlags; + LIR::AsRange(newBlock).InsertAfter(nullptr, hiJcc); + + BlockRange().Remove(loSrc1.Def()); + BlockRange().Remove(loSrc2.Def()); + GenTree* loCmp = comp->gtNewOperNode(loCmpOper, TYP_INT, loSrc1.Def(), loSrc2.Def()); + loCmp->gtFlags = cmp->gtFlags | GTF_UNSIGNED; + GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp); + LIR::AsRange(newBlock2).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue); + + m_block->bbJumpKind = BBJ_COND; + m_block->bbJumpDest = nextDest; + nextDest->bbFlags |= BBF_JMP_TARGET; + comp->fgAddRefPred(nextDest, m_block); + + newBlock->bbJumpKind = BBJ_COND; + newBlock->bbJumpDest = jumpDest; + comp->fgAddRefPred(jumpDest, newBlock); + + assert(newBlock2->bbJumpKind == BBJ_COND); + assert(newBlock2->bbJumpDest == jumpDest); + } + + BlockRange().Remove(src1); + BlockRange().Remove(src2); +#endif +} + // Lower "jmp <method>" tail call to insert PInvoke method epilog if required. 
void Lowering::LowerJmpMethod(GenTree* jmp) { @@ -2334,8 +2667,12 @@ void Lowering::InsertPInvokeMethodProlog() DISPTREERANGE(firstBlockRange, storeFP); // -------------------------------------------------------- + // On 32-bit targets, CORINFO_HELP_INIT_PINVOKE_FRAME initializes the PInvoke frame and then pushes it onto + // the current thread's Frame stack. On 64-bit targets, it only initializes the PInvoke frame. + CLANG_FORMAT_COMMENT_ANCHOR; - if (comp->opts.eeFlags & CORJIT_FLG_IL_STUB) +#ifdef _TARGET_64BIT_ + if (comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB)) { // Push a frame - if we are NOT in an IL stub, this is done right before the call // The init routine sets InlinedCallFrame's m_pNext, so we just set the thead's top-of-stack @@ -2343,6 +2680,7 @@ void Lowering::InsertPInvokeMethodProlog() firstBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, frameUpd)); DISPTREERANGE(firstBlockRange, frameUpd); } +#endif // _TARGET_64BIT_ } //------------------------------------------------------------------------ @@ -2405,9 +2743,14 @@ void Lowering::InsertPInvokeMethodEpilog(BasicBlock* returnBB DEBUGARG(GenTreePt GenTree* storeGCState = SetGCState(1); returnBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, storeGCState)); - if (comp->opts.eeFlags & CORJIT_FLG_IL_STUB) + // Pop the frame if necessary. This always happens in the epilog on 32-bit targets. For 64-bit targets, we only do + // this in the epilog for IL stubs; for non-IL stubs the frame is popped after every PInvoke call. + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef _TARGET_64BIT_ + if (comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB)) +#endif // _TARGET_64BIT_ { - // Pop the frame, in non-stubs we do this around each PInvoke call GenTree* frameUpd = CreateFrameLinkUpdate(PopFrame); returnBlockRange.InsertBefore(insertionPoint, LIR::SeqTree(comp, frameUpd)); } @@ -2454,6 +2797,7 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call) comp->fgMorphTree(helperCall); BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, helperCall)); + LowerNode(helperCall); // helper call is inserted before current node and should be lowered here. return; } #endif @@ -2464,7 +2808,7 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call) // InlinedCallFrame.m_pCallSiteSP = SP // x86 only // InlinedCallFrame.m_pCallerReturnAddress = return address // Thread.gcState = 0 - // (non-stub) - update top Frame on TCB + // (non-stub) - update top Frame on TCB // 64-bit targets only // ---------------------------------------------------------------------------------- // Setup InlinedCallFrame.callSiteTarget (which is how the JIT refers to it). @@ -2474,11 +2818,19 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call) if (callType == CT_INDIRECT) { +#if !defined(_TARGET_64BIT_) + // On 32-bit targets, indirect calls need the size of the stack args in InlinedCallFrame.m_Datum. + const unsigned numStkArgBytes = call->fgArgInfo->GetNextSlotNum() * TARGET_POINTER_SIZE; + + src = comp->gtNewIconNode(numStkArgBytes, TYP_INT); +#else + // On 64-bit targets, indirect calls may need the stub parameter value in InlinedCallFrame.m_Datum. + // If the stub parameter value is not needed, m_Datum will be initialized by the VM. 
if (comp->info.compPublishStubParam) { - src = new (comp, GT_LCL_VAR) GenTreeLclVar(TYP_I_IMPL, comp->lvaStubArgumentVar, BAD_IL_OFFSET); + src = comp->gtNewLclvNode(comp->lvaStubArgumentVar, TYP_I_IMPL); } - // else { If we don't have secret parameter, m_Datum will be initialized by VM code } +#endif // !defined(_TARGET_64BIT_) } else { @@ -2542,7 +2894,12 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call) BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, storeLab)); - if (!(comp->opts.eeFlags & CORJIT_FLG_IL_STUB)) + // Push the PInvoke frame if necessary. On 32-bit targets this only happens in the method prolog if a method + // contains PInvokes; on 64-bit targets this is necessary in non-stubs. + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef _TARGET_64BIT_ + if (!comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB)) { // Set the TCB's frame to be the one we just created. // Note the init routine for the InlinedCallFrame (CORINFO_HELP_INIT_PINVOKE_FRAME) @@ -2552,6 +2909,7 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call) GenTree* frameUpd = CreateFrameLinkUpdate(PushFrame); BlockRange().InsertBefore(insertBefore, LIR::SeqTree(comp, frameUpd)); } +#endif // _TARGET_64BIT_ // IMPORTANT **** This instruction must come last!!! **** // It changes the thread's state to Preemptive mode @@ -2583,7 +2941,7 @@ void Lowering::InsertPInvokeCallEpilog(GenTreeCall* call) // First argument is the address of the frame variable. GenTree* frameAddr = new (comp, GT_LCL_VAR) GenTreeLclVar(GT_LCL_VAR, TYP_BYREF, comp->lvaInlinedPInvokeFrameVar, BAD_IL_OFFSET); - frameAddr->gtOper = GT_LCL_VAR_ADDR; + frameAddr->SetOperRaw(GT_LCL_VAR_ADDR); // Insert call to CORINFO_HELP_JIT_PINVOKE_END GenTree* helperCall = @@ -2604,12 +2962,32 @@ void Lowering::InsertPInvokeCallEpilog(GenTreeCall* call) tree = CreateReturnTrapSeq(); BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree)); - // Pop the frame if necessasry - if (!(comp->opts.eeFlags & CORJIT_FLG_IL_STUB)) + // Pop the frame if necessary. On 32-bit targets this only happens in the method epilog; on 64-bit targets thi + // happens after every PInvoke call in non-stubs. 32-bit targets instead mark the frame as inactive. + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef _TARGET_64BIT_ + if (!comp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_IL_STUB)) { tree = CreateFrameLinkUpdate(PopFrame); BlockRange().InsertBefore(insertionPoint, LIR::SeqTree(comp, tree)); } +#else + const CORINFO_EE_INFO::InlinedCallFrameInfo& callFrameInfo = comp->eeGetEEInfo()->inlinedCallFrameInfo; + + // ---------------------------------------------------------------------------------- + // InlinedCallFrame.m_pCallerReturnAddress = nullptr + + GenTreeLclFld* const storeCallSiteTracker = + new (comp, GT_STORE_LCL_FLD) GenTreeLclFld(GT_STORE_LCL_FLD, TYP_I_IMPL, comp->lvaInlinedPInvokeFrameVar, + callFrameInfo.offsetOfReturnAddress); + + GenTreeIntCon* const constantZero = new (comp, GT_CNS_INT) GenTreeIntCon(TYP_I_IMPL, 0); + + storeCallSiteTracker->gtOp1 = constantZero; + + BlockRange().InsertBefore(insertionPoint, constantZero, storeCallSiteTracker); +#endif // _TARGET_64BIT_ } //------------------------------------------------------------------------ @@ -2624,7 +3002,7 @@ void Lowering::InsertPInvokeCallEpilog(GenTreeCall* call) GenTree* Lowering::LowerNonvirtPinvokeCall(GenTreeCall* call) { // PInvoke lowering varies depending on the flags passed in by the EE. 
By default, - // GC transitions are generated inline; if CORJIT_FLG2_USE_PINVOKE_HELPERS is specified, + // GC transitions are generated inline; if CORJIT_FLAG_USE_PINVOKE_HELPERS is specified, // GC transitions are instead performed using helper calls. Examples of each case are given // below. Note that the data structure that is used to store information about a call frame // containing any P/Invoke calls is initialized in the method prolog (see @@ -2697,7 +3075,7 @@ GenTree* Lowering::LowerNonvirtPinvokeCall(GenTreeCall* call) #if COR_JIT_EE_VERSION > 460 comp->info.compCompHnd->getAddressOfPInvokeTarget(methHnd, &lookup); #else - void* pIndirection; + void* pIndirection; lookup.accessType = IAT_PVALUE; lookup.addr = comp->info.compCompHnd->getAddressOfPInvokeFixup(methHnd, &pIndirection); if (lookup.addr == nullptr) @@ -2866,14 +3244,10 @@ GenTree* Lowering::LowerVirtualStubCall(GenTreeCall* call) } #endif - // TODO-Cleanup: Disable emitting random NOPs - // This is code to set up an indirect call to a stub address computed // via dictionary lookup. if (call->gtCallType == CT_INDIRECT) { - NYI_X86("Virtual Stub dispatched call lowering via dictionary lookup"); - // The importer decided we needed a stub call via a computed // stub dispatch address, i.e. an address which came from a dictionary lookup. // - The dictionary lookup produces an indirected address, suitable for call @@ -2886,6 +3260,8 @@ GenTree* Lowering::LowerVirtualStubCall(GenTreeCall* call) // All we have to do here is add an indirection to generate the actual call target. GenTree* ind = Ind(call->gtCallAddr); + ind->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; + BlockRange().InsertAfter(call->gtCallAddr, ind); call->gtCallAddr = ind; } @@ -2923,8 +3299,10 @@ GenTree* Lowering::LowerVirtualStubCall(GenTreeCall* call) // So we don't use a register. #ifndef _TARGET_X86_ // on x64 we must materialize the target using specific registers. - addr->gtRegNum = REG_VIRTUAL_STUB_PARAM; + addr->gtRegNum = REG_VIRTUAL_STUB_PARAM; + indir->gtRegNum = REG_JUMP_THUNK_PARAM; + indir->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; #endif result = indir; } @@ -3042,8 +3420,6 @@ bool Lowering::AreSourcesPossiblyModifiedLocals(GenTree* addr, GenTree* base, Ge return true; } } - - unreached(); } //------------------------------------------------------------------------ @@ -3082,9 +3458,9 @@ GenTree* Lowering::TryCreateAddrMode(LIR::Use&& use, bool isIndir) { // We can have an indirection on the rhs of a block copy (it is the source // object). This is not a "regular" indirection. - // (Note that the parent check could be costly.) - GenTree* parent = indir->gtGetParent(nullptr); - if ((parent != nullptr) && parent->OperIsIndir()) + // (Note that the user check could be costly.) 
+ LIR::Use indirUse; + if (BlockRange().TryGetUse(indir, &indirUse) && indirUse.User()->OperIsIndir()) { isIndir = false; } @@ -3248,9 +3624,14 @@ void Lowering::LowerUnsignedDivOrMod(GenTree* node) { assert((node->OperGet() == GT_UDIV) || (node->OperGet() == GT_UMOD)); - GenTree* divisor = node->gtGetOp2(); + GenTree* divisor = node->gtGetOp2(); + GenTree* dividend = node->gtGetOp1(); - if (divisor->IsCnsIntOrI()) + if (divisor->IsCnsIntOrI() +#ifdef _TARGET_X86_ + && (dividend->OperGet() != GT_LONG) +#endif + ) { size_t divisorValue = static_cast<size_t>(divisor->gtIntCon.IconValue()); @@ -3276,6 +3657,91 @@ void Lowering::LowerUnsignedDivOrMod(GenTree* node) } //------------------------------------------------------------------------ +// GetSignedMagicNumberForDivide: Generates a magic number and shift amount for +// the magic number division optimization. +// +// Arguments: +// denom - The denominator +// shift - Pointer to the shift value to be returned +// +// Returns: +// The magic number. +// +// Notes: +// This code is previously from UTC where it notes it was taken from +// _The_PowerPC_Compiler_Writer's_Guide_, pages 57-58. The paper is is based on +// is "Division by invariant integers using multiplication" by Torbjorn Granlund +// and Peter L. Montgomery in PLDI 94 + +template <typename T> +T GetSignedMagicNumberForDivide(T denom, int* shift /*out*/) +{ + // static SMAG smag; + const int bits = sizeof(T) * 8; + const int bits_minus_1 = bits - 1; + + typedef typename jitstd::make_unsigned<T>::type UT; + + const UT two_nminus1 = UT(1) << bits_minus_1; + + int p; + UT absDenom; + UT absNc; + UT delta; + UT q1; + UT r1; + UT r2; + UT q2; + UT t; + T result_magic; + int result_shift; + int iters = 0; + + absDenom = abs(denom); + t = two_nminus1 + ((unsigned int)denom >> 31); + absNc = t - 1 - (t % absDenom); // absolute value of nc + p = bits_minus_1; // initialize p + q1 = two_nminus1 / absNc; // initialize q1 = 2^p / abs(nc) + r1 = two_nminus1 - (q1 * absNc); // initialize r1 = rem(2^p, abs(nc)) + q2 = two_nminus1 / absDenom; // initialize q1 = 2^p / abs(denom) + r2 = two_nminus1 - (q2 * absDenom); // initialize r1 = rem(2^p, abs(denom)) + + do + { + iters++; + p++; + q1 *= 2; // update q1 = 2^p / abs(nc) + r1 *= 2; // update r1 = rem(2^p / abs(nc)) + + if (r1 >= absNc) + { // must be unsigned comparison + q1++; + r1 -= absNc; + } + + q2 *= 2; // update q2 = 2^p / abs(denom) + r2 *= 2; // update r2 = rem(2^p / abs(denom)) + + if (r2 >= absDenom) + { // must be unsigned comparison + q2++; + r2 -= absDenom; + } + + delta = absDenom - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + result_magic = q2 + 1; // resulting magic number + if (denom < 0) + { + result_magic = -result_magic; + } + *shift = p - bits; // resulting shift + + return result_magic; +} + +//------------------------------------------------------------------------ // LowerSignedDivOrMod: transform integer GT_DIV/GT_MOD nodes with a power of 2 // const divisor into equivalent but faster sequences. // @@ -3313,8 +3779,10 @@ GenTree* Lowering::LowerSignedDivOrMod(GenTreePtr node) ssize_t divisorValue = divisor->gtIntCon.IconValue(); - if (divisorValue == -1) + if (divisorValue == -1 || divisorValue == 0) { + // x / 0 and x % 0 can't be optimized because they are required to throw an exception. + // x / -1 can't be optimized because INT_MIN / -1 is required to throw an exception. 
// x % -1 is always 0 and the IL spec says that the rem instruction "can" throw an exception if x is @@ -3343,14 +3811,122 @@ GenTree* Lowering::LowerSignedDivOrMod(GenTreePtr node) if (!isPow2(absDivisorValue)) { +#ifdef _TARGET_XARCH_ + ssize_t magic; + int shift; + + if (type == TYP_INT) + { + magic = GetSignedMagicNumberForDivide<int32_t>(static_cast<int32_t>(divisorValue), &shift); + } + else + { +#ifdef _TARGET_64BIT_ + magic = GetSignedMagicNumberForDivide<int64_t>(static_cast<int64_t>(divisorValue), &shift); +#else + unreached(); +#endif + } + + divisor->gtIntConCommon.SetIconValue(magic); + + // Insert a new GT_MULHI node in front of the existing GT_DIV/GT_MOD node. + // The existing node will later be transformed into a GT_ADD/GT_SUB that + // computes the final result. This way don't need to find and change the + // use of the existing node. + GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, divisor, dividend); + BlockRange().InsertBefore(divMod, mulhi); + + // mulhi was the easy part. Now we need to generate different code depending + // on the divisor value: + // For 3 we need: + // div = signbit(mulhi) + mulhi + // For 5 we need: + // div = signbit(mulhi) + sar(mulhi, 1) ; requires shift adjust + // For 7 we need: + // mulhi += dividend ; requires add adjust + // div = signbit(mulhi) + sar(mulhi, 2) ; requires shift adjust + // For -3 we need: + // mulhi -= dividend ; requires sub adjust + // div = signbit(mulhi) + sar(mulhi, 1) ; requires shift adjust + bool requiresAddSubAdjust = signum(divisorValue) != signum(magic); + bool requiresShiftAdjust = shift != 0; + bool requiresDividendMultiuse = requiresAddSubAdjust || !isDiv; + unsigned curBBWeight = comp->compCurBB->getBBWeight(comp); + unsigned dividendLclNum = BAD_VAR_NUM; + + if (requiresDividendMultiuse) + { + LIR::Use dividendUse(BlockRange(), &mulhi->gtOp.gtOp2, mulhi); + dividendLclNum = dividendUse.ReplaceWithLclVar(comp, curBBWeight); + } + + GenTree* adjusted; + + if (requiresAddSubAdjust) + { + dividend = comp->gtNewLclvNode(dividendLclNum, type); + comp->lvaTable[dividendLclNum].incRefCnts(curBBWeight, comp); + + adjusted = comp->gtNewOperNode(divisorValue > 0 ? 
GT_ADD : GT_SUB, type, mulhi, dividend); + BlockRange().InsertBefore(divMod, dividend, adjusted); + } + else + { + adjusted = mulhi; + } + + GenTree* shiftBy = comp->gtNewIconNode(genTypeSize(type) * 8 - 1, type); + GenTree* signBit = comp->gtNewOperNode(GT_RSZ, type, adjusted, shiftBy); + BlockRange().InsertBefore(divMod, shiftBy, signBit); + + LIR::Use adjustedUse(BlockRange(), &signBit->gtOp.gtOp1, signBit); + unsigned adjustedLclNum = adjustedUse.ReplaceWithLclVar(comp, curBBWeight); + adjusted = comp->gtNewLclvNode(adjustedLclNum, type); + comp->lvaTable[adjustedLclNum].incRefCnts(curBBWeight, comp); + BlockRange().InsertBefore(divMod, adjusted); + + if (requiresShiftAdjust) + { + shiftBy = comp->gtNewIconNode(shift, TYP_INT); + adjusted = comp->gtNewOperNode(GT_RSH, type, adjusted, shiftBy); + BlockRange().InsertBefore(divMod, shiftBy, adjusted); + } + + if (isDiv) + { + divMod->SetOperRaw(GT_ADD); + divMod->gtOp.gtOp1 = adjusted; + divMod->gtOp.gtOp2 = signBit; + } + else + { + GenTree* div = comp->gtNewOperNode(GT_ADD, type, adjusted, signBit); + + dividend = comp->gtNewLclvNode(dividendLclNum, type); + comp->lvaTable[dividendLclNum].incRefCnts(curBBWeight, comp); + + // divisor % dividend = dividend - divisor x div + GenTree* divisor = comp->gtNewIconNode(divisorValue, type); + GenTree* mul = comp->gtNewOperNode(GT_MUL, type, div, divisor); + BlockRange().InsertBefore(divMod, dividend, div, divisor, mul); + + divMod->SetOperRaw(GT_SUB); + divMod->gtOp.gtOp1 = dividend; + divMod->gtOp.gtOp2 = mul; + } + + return mulhi; +#else + // Currently there's no GT_MULHI for ARM32/64 return next; +#endif } - // We're committed to the conversion now. Go find the use. + // We're committed to the conversion now. Go find the use if any. LIR::Use use; if (!BlockRange().TryGetUse(node, &use)) { - assert(!"signed DIV/MOD node is unused"); return next; } @@ -3450,8 +4026,6 @@ void Lowering::LowerStoreInd(GenTree* node) void Lowering::LowerBlockStore(GenTreeBlk* blkNode) { GenTree* src = blkNode->Data(); - // TODO-1stClassStructs: Don't require this. - assert(blkNode->OperIsInitBlkOp() || !src->OperIsLocal()); TryCreateAddrMode(LIR::Use(BlockRange(), &blkNode->Addr(), blkNode), false); } @@ -3817,17 +4391,17 @@ void Lowering::CheckCallArg(GenTree* arg) break; #endif - case GT_LIST: - { - GenTreeArgList* list = arg->AsArgList(); - assert(list->IsAggregate()); + case GT_FIELD_LIST: + { + GenTreeFieldList* list = arg->AsFieldList(); + assert(list->IsFieldListHead()); - for (; list != nullptr; list = list->Rest()) - { - assert(list->Current()->OperIsPutArg()); - } + for (; list != nullptr; list = list->Rest()) + { + assert(list->Current()->OperIsPutArg()); } - break; + } + break; default: assert(arg->OperIsPutArg()); |
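
Below is a minimal standalone sketch (plain C++, not JIT LIR; the function name is illustrative) of the branch structure that the LowerCompare change above produces for a signed 64-bit `<` feeding a GT_JTRUE on a 32-bit target: the high halves are compared first as signed values, and only when they are equal does control reach the comparison of the low halves, which is performed unsigned (the GTF_UNSIGNED flag set on the low compare).

// Scalar model of the two-compare decomposition: BB0 tests "!cond(hi) && !eq(hi)",
// BB3 tests "cond(hi) && !eq(hi)", and BB4 compares the low halves unsigned.
#include <cassert>
#include <cstdint>

static bool JumpTakenForLongLessThan(int64_t x, int64_t y)
{
    int32_t  xHi = (int32_t)(x >> 32);  // hi(x), compared signed
    int32_t  yHi = (int32_t)(y >> 32);
    uint32_t xLo = (uint32_t)x;         // lo(x), compared unsigned
    uint32_t yLo = (uint32_t)y;

    if (xHi > yHi)                      // BB0: "!cond(hi) && !eq(hi)" -> branch not taken
        return false;
    if (xHi < yHi)                      // BB3: "cond(hi) && !eq(hi)" -> branch taken
        return true;
    return xLo < yLo;                   // BB4: cond(lo), unsigned
}

int main()
{
    assert(JumpTakenForLongLessThan(2, 3));
    assert(!JumpTakenForLongLessThan(0x100000000LL, 3)); // high halves differ
    assert(JumpTakenForLongLessThan(-1, 0));             // high halves: -1 < 0
    assert(!JumpTakenForLongLessThan(3, 3));
    return 0;
}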
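The signed magic-number division added in LowerSignedDivOrMod can also be illustrated in scalar form. The following self-contained sketch (plain C++; SignedMagic32 and DivByConst are assumed names, not JIT code) follows the GetSignedMagicNumberForDivide derivation above (Granlund and Montgomery, "Division by invariant integers using multiplication", PLDI '94) and mirrors the emitted node sequence for a non-power-of-2 constant divisor: a GT_MULHI, an add/sub adjust when the magic number and the divisor have opposite signs, an arithmetic shift when the shift amount is nonzero, and the sign bit added back. The remainder case would then be dividend - quotient * divisor, as in the GT_SUB/GT_MUL sequence above.

#include <cassert>
#include <cstdint>
#include <cstdlib>

// 32-bit specialization of the magic-number derivation shown in the diff.
static int32_t SignedMagic32(int32_t denom, int* shift)
{
    const uint32_t two31 = 0x80000000u;

    uint32_t absDenom = (uint32_t)std::abs(denom);
    uint32_t t        = two31 + ((uint32_t)denom >> 31);
    uint32_t absNc    = t - 1 - (t % absDenom);   // absolute value of nc
    int      p        = 31;
    uint32_t q1 = two31 / absNc;                  // 2^p / abs(nc)
    uint32_t r1 = two31 - q1 * absNc;             // rem(2^p, abs(nc))
    uint32_t q2 = two31 / absDenom;               // 2^p / abs(denom)
    uint32_t r2 = two31 - q2 * absDenom;          // rem(2^p, abs(denom))
    uint32_t delta;

    do
    {
        p++;
        q1 *= 2;
        r1 *= 2;
        if (r1 >= absNc)
        {
            q1++;
            r1 -= absNc;
        }
        q2 *= 2;
        r2 *= 2;
        if (r2 >= absDenom)
        {
            q2++;
            r2 -= absDenom;
        }
        delta = absDenom - r2;
    } while (q1 < delta || (q1 == delta && r1 == 0));

    int32_t magic = (int32_t)(q2 + 1);
    if (denom < 0)
    {
        magic = -magic;
    }
    *shift = p - 32;
    return magic;
}

// n / d for a constant d that is not 0, -1, or a power of 2.
static int32_t DivByConst(int32_t n, int32_t d)
{
    int     shift;
    int32_t magic = SignedMagic32(d, &shift);

    int32_t hi = (int32_t)(((int64_t)n * magic) >> 32);  // GT_MULHI
    if ((d > 0) != (magic > 0))                          // signs differ: adjust
    {
        hi += (d > 0) ? n : -n;                          // GT_ADD / GT_SUB
    }
    int32_t signBit = (int32_t)((uint32_t)hi >> 31);     // GT_RSZ by 31
    return (hi >> shift) + signBit;                      // GT_RSH, then GT_ADD
}

int main()
{
    for (int32_t d : {3, 5, 7, 10, 641, -3, -7, -100000})
    {
        for (int32_t n : {0, 1, -1, 6, -6, 12345678, -12345678, INT32_MAX, INT32_MIN + 1})
        {
            assert(DivByConst(n, d) == n / d);
        }
    }
    return 0;
}

The divisor values 3, 5, 7, and -3 exercise exactly the four adjust/shift combinations called out in the comments above the mulhi node.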