author    Jiyoung Yun <jy910.yun@samsung.com>  2016-12-27 16:46:08 +0900
committer Jiyoung Yun <jy910.yun@samsung.com>  2016-12-27 16:46:08 +0900
commit    db20f3f1bb8595633a7e16c8900fd401a453a6b5
tree      e5435159cd1bf0519276363a6fe1663d1721bed3 /src/jit/lowerxarch.cpp
parent    4b4aad7217d3292650e77eec2cf4c198ea9c3b4b
Imported Upstream version 1.0.0.9127 (tag: upstream/1.0.0.9127)
Diffstat (limited to 'src/jit/lowerxarch.cpp')
 -rw-r--r--  src/jit/lowerxarch.cpp | 1104
 1 file changed, 857 insertions(+), 247 deletions(-)
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 6f98eb6661..589cef482e 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -77,7 +77,7 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
// InitBlk
MakeSrcContained(storeLoc, op1);
}
- else if (storeLoc->TypeGet() == TYP_SIMD12)
+ else if ((storeLoc->TypeGet() == TYP_SIMD12) && (storeLoc->OperGet() == GT_STORE_LCL_FLD))
{
// Need an additional register to extract upper 4 bytes of Vector3.
info->internalFloatCount = 1;
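
Reviewer note on the hunk above: a TYP_SIMD12 (Vector3) local field store writes 8 bytes plus a trailing 4 bytes, and the Z element must first be extracted into a scratch XMM register. A minimal sketch with SSE intrinsics, assuming a movsd + pshufd/shufps + movss shape (illustrative only, not the JIT's emitted code; StoreVector3 is a hypothetical helper):

    #include <xmmintrin.h>

    // Store a 12-byte vector: one 8-byte low store, then extract and store Z.
    void StoreVector3(float* dst, __m128 v)
    {
        _mm_storel_pi((__m64*)dst, v);                             // X, Y (8 bytes)
        __m128 z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));  // scratch = Z
        _mm_store_ss(dst + 2, z);                                  // Z (4 bytes)
    }

The scratch value `z` is the reason the node reserves one internal float register.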
@@ -177,6 +177,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
break;
case GT_LCL_FLD:
+ case GT_LCL_VAR:
info->srcCount = 0;
info->dstCount = 1;
@@ -185,9 +186,9 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
if (tree->TypeGet() == TYP_SIMD12)
{
// We need an internal register different from targetReg in which 'tree' produces its result
- // because both targetReg and internal reg will be in use at the same time. This is achieved
- // by asking for two internal registers.
- info->internalFloatCount = 2;
+ // because both targetReg and internal reg will be in use at the same time.
+ info->internalFloatCount = 1;
+ info->isInternalRegDelayFree = true;
info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
}
#endif
@@ -195,7 +196,16 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
case GT_STORE_LCL_FLD:
case GT_STORE_LCL_VAR:
- info->srcCount = 1;
+#ifdef _TARGET_X86_
+ if (tree->gtGetOp1()->OperGet() == GT_LONG)
+ {
+ info->srcCount = 2;
+ }
+ else
+#endif // _TARGET_X86_
+ {
+ info->srcCount = 1;
+ }
info->dstCount = 0;
LowerStoreLoc(tree->AsLclVarCommon());
break;
@@ -242,6 +252,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
break;
case GT_LIST:
+ case GT_FIELD_LIST:
case GT_ARGPLACE:
case GT_NO_OP:
case GT_START_NONGC:
@@ -319,9 +330,87 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
break;
case GT_JTRUE:
+ {
+ info->srcCount = 0;
+ info->dstCount = 0;
+
+ GenTree* cmp = tree->gtGetOp1();
+ l->clearDstCount(cmp);
+
+#ifdef FEATURE_SIMD
+ // Say we have the following IR
+ // simdCompareResult = GT_SIMD((In)Equality, v1, v2)
+ // integerCompareResult = GT_EQ/NE(simdCompareResult, true/false)
+ // GT_JTRUE(integerCompareResult)
+ //
+ // In this case we don't need to generate code for GT_EQ/GT_NE, since the SIMD (In)Equality
+ // intrinsic will set or clear the Zero flag.
+
+ genTreeOps cmpOper = cmp->OperGet();
+ if (cmpOper == GT_EQ || cmpOper == GT_NE)
+ {
+ GenTree* cmpOp1 = cmp->gtGetOp1();
+ GenTree* cmpOp2 = cmp->gtGetOp2();
+
+ if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1)))
+ {
+ // clear dstCount on SIMD node to indicate that
+ // result doesn't need to be materialized into a register.
+ l->clearOperandCounts(cmp);
+ l->clearDstCount(cmpOp1);
+ l->clearOperandCounts(cmpOp2);
+
+ // Codegen of SIMD (in)Equality uses target integer reg
+ // only for setting flags. Target reg is not needed on AVX
+ // when comparing against Vector Zero. In all other cases
+ // we need to reserve an int type internal register, since we
+ // have cleared dstCount.
+ if (compiler->canUseAVX() && cmpOp1->gtGetOp2()->IsIntegralConstVector(0))
+ {
+ // We don't need an internal register, since we use vptest
+ // for setting flags.
+ }
+ else
+ {
+ ++(cmpOp1->gtLsraInfo.internalIntCount);
+ regMaskTP internalCandidates = cmpOp1->gtLsraInfo.getInternalCandidates(l);
+ internalCandidates |= l->allRegs(TYP_INT);
+ cmpOp1->gtLsraInfo.setInternalCandidates(l, internalCandidates);
+ }
+
+ // We would have to reverse the compare oper in the following cases:
+ // 1) SIMD Equality: Sets the Zero flag on equal, otherwise clears it.
+ // Therefore, if the compare oper is == or != against false(0), we will
+ // be checking the opposite of what is required.
+ //
+ // 2) SIMD InEquality: Clears the Zero flag on true, otherwise sets it.
+ // Therefore, if the compare oper is == or != against true(1), we will
+ // be checking the opposite of what is required.
+ GenTreeSIMD* simdNode = cmpOp1->AsSIMD();
+ if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality)
+ {
+ if (cmpOp2->IsIntegralConst(0))
+ {
+ cmp->SetOper(GenTree::ReverseRelop(cmpOper));
+ }
+ }
+ else
+ {
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality);
+ if (cmpOp2->IsIntegralConst(1))
+ {
+ cmp->SetOper(GenTree::ReverseRelop(cmpOper));
+ }
+ }
+ }
+ }
+#endif // FEATURE_SIMD
+ }
+ break;
+
+ case GT_JCC:
info->srcCount = 0;
info->dstCount = 0;
- l->clearDstCount(tree->gtOp.gtOp1);
break;
case GT_JMP:
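
To make the reversal rules in the hunk above concrete, here is a small standalone sketch (a hypothetical helper, not JIT code; RelOp and RelopForSimdJTrue are names invented for illustration):

    // SIMD OpEquality sets ZF on equal; OpInEquality clears ZF on unequal.
    // Comparing that boolean against the "wrong" constant flips the sense.
    enum RelOp { EQ, NE };

    RelOp RelopForSimdJTrue(bool isOpEquality, RelOp cmpOper, int cnsVal)
    {
        // Reverse when testing OpEquality against false(0), or
        // OpInEquality against true(1), matching the comments above.
        bool reverse = isOpEquality ? (cnsVal == 0) : (cnsVal == 1);
        if (!reverse)
        {
            return cmpOper;
        }
        return (cmpOper == EQ) ? NE : EQ;
    }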
@@ -436,6 +525,9 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
case GT_MUL:
case GT_MULHI:
+#if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)
+ case GT_MUL_LONG:
+#endif
SetMulOpCounts(tree);
break;
@@ -478,6 +570,11 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
info->internalFloatCount = 1;
info->setInternalCandidates(l, l->internalFloatRegCandidates());
}
+ else
+ {
+ // Codegen of this tree node sets ZF and SF flags.
+ tree->gtFlags |= GTF_ZSF_SET;
+ }
break;
case GT_NOT:
@@ -490,6 +587,10 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
case GT_RSZ:
case GT_ROL:
case GT_ROR:
+#ifdef _TARGET_X86_
+ case GT_LSH_HI:
+ case GT_RSH_LO:
+#endif
TreeNodeInfoInitShiftRotate(tree);
break;
@@ -499,7 +600,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
case GT_LE:
case GT_GE:
case GT_GT:
- LowerCmp(tree);
+ TreeNodeInfoInitCmp(tree);
break;
case GT_CKFINITE:
@@ -542,10 +643,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
}
break;
-#ifdef _TARGET_X86_
- case GT_OBJ:
- NYI_X86("GT_OBJ");
-#elif !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#if !defined(FEATURE_PUT_STRUCT_ARG_STK)
case GT_OBJ:
#endif
case GT_BLK:
@@ -556,11 +654,11 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
info->dstCount = 0;
break;
-#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
case GT_PUTARG_STK:
- TreeNodeInfoInitPutArgStk(tree);
+ TreeNodeInfoInitPutArgStk(tree->AsPutArgStk());
break;
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+#endif // FEATURE_PUT_STRUCT_ARG_STK
case GT_STORE_BLK:
case GT_STORE_OBJ:
@@ -568,6 +666,12 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
TreeNodeInfoInitBlockStore(tree->AsBlk());
break;
+ case GT_INIT_VAL:
+ // Always a passthrough of its child's value.
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+
case GT_LCLHEAP:
TreeNodeInfoInitLclHeap(tree);
break;
@@ -634,14 +738,20 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
case GT_ARR_OFFSET:
// This consumes the offset, if any, the arrObj and the effective index,
// and produces the flattened offset for this dimension.
- info->srcCount = 3;
- info->dstCount = 1;
- info->internalIntCount = 1;
+ info->srcCount = 3;
+ info->dstCount = 1;
+
// we don't want to generate code for this
if (tree->gtArrOffs.gtOffset->IsIntegralConst(0))
{
MakeSrcContained(tree, tree->gtArrOffs.gtOffset);
}
+ else
+ {
+ // Here we simply need an internal register, which must be different
+ // from any of the operand's registers, but may be the same as targetReg.
+ info->internalIntCount = 1;
+ }
break;
case GT_LEA:
@@ -725,15 +835,9 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
#endif
case GT_CLS_VAR:
- info->srcCount = 0;
- // GT_CLS_VAR, by the time we reach the backend, must always
- // be a pure use.
- // It will produce a result of the type of the
- // node, and use an internal register for the address.
-
- info->dstCount = 1;
- assert((tree->gtFlags & (GTF_VAR_DEF | GTF_VAR_USEASG | GTF_VAR_USEDEF)) == 0);
- info->internalIntCount = 1;
+ // These nodes are eliminated by rationalizer.
+ JITDUMP("Unexpected node %s in Lower.\n", GenTree::NodeName(tree->OperGet()));
+ unreached();
break;
} // end switch (tree->OperGet())
@@ -813,27 +917,36 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
}
}
+ TreeNodeInfoInitCheckByteable(tree);
+
+ // We need to be sure that we've set info->srcCount and info->dstCount appropriately
+ assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCheckByteable: Check the tree to see if "byte-able" registers are
+// required, and set the tree node info accordingly.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitCheckByteable(GenTree* tree)
+{
#ifdef _TARGET_X86_
+ LinearScan* l = m_lsra;
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+
// Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands
// if the tree node is a byte type.
//
- // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'
- // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT
- // value. In this case we need to exclude esi/edi from the src candidates of op2.
- //
- // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool.
- //
- // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
- // ubyte as the result of comparison and if the result needs to be materialized into a reg
- // simply zero extend it to TYP_INT size. Here is an example of generated code:
- // cmp dl, byte ptr[addr mode]
- // movzx edx, dl
- //
// Though this looks conservative in theory, in practice we could not think of a case where
// the below logic leads to conservative register specification. In future when or if we find
// one such case, this logic needs to be fine tuned for that case(s).
- if (varTypeIsByte(tree) || ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType())) ||
- (tree->OperIsCompare() && varTypeIsByte(tree->gtGetOp1()) && varTypeIsByte(tree->gtGetOp2())))
+
+ if (ExcludeNonByteableRegisters(tree))
{
regMaskTP regMask;
if (info->dstCount > 0)
@@ -870,9 +983,6 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
}
}
#endif //_TARGET_X86_
-
- // We need to be sure that we've set info->srcCount and info->dstCount appropriately
- assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
}
//------------------------------------------------------------------------
@@ -1028,6 +1138,31 @@ void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree)
GenTreePtr shiftBy = tree->gtOp.gtOp2;
GenTreePtr source = tree->gtOp.gtOp1;
+#ifdef _TARGET_X86_
+ // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
+ // we can have a three operand form. Increment the srcCount.
+ if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
+ {
+ assert(source->OperGet() == GT_LONG);
+
+ info->srcCount++;
+
+ if (tree->OperGet() == GT_LSH_HI)
+ {
+ GenTreePtr sourceLo = source->gtOp.gtOp1;
+ sourceLo->gtLsraInfo.isDelayFree = true;
+ }
+ else
+ {
+ GenTreePtr sourceHi = source->gtOp.gtOp2;
+ sourceHi->gtLsraInfo.isDelayFree = true;
+ }
+
+ source->gtLsraInfo.hasDelayFreeSrc = true;
+ info->hasDelayFreeSrc = true;
+ }
+#endif
+
// x64 can encode 8 bits of shift and it will use 5 or 6. (the others are masked off)
// We will allow whatever can be encoded - hope you know what you are doing.
if (!IsContainableImmed(tree, shiftBy) || (shiftBy->gtIntConCommon.IconValue() > 255) ||
@@ -1040,6 +1175,17 @@ void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree)
else
{
MakeSrcContained(tree, shiftBy);
+
+ // Note that Rotate Left/Right instructions don't set ZF and SF flags.
+ //
+ // If the operand being shifted is 32-bits then upper three bits are masked
+ // by hardware to get actual shift count. Similarly for 64-bit operands
+ // shift count is narrowed to [0..63]. If the resulting shift count is zero,
+ // then shift operation won't modify flags.
+ //
+ // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
+ // if the shift count is known to be non-zero and in the range depending on the
+ // operand size.
}
}
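
Two short sketches for the shift handling above, both illustrative C++ assuming standard x86 shl/shld semantics (Shl32 and ShiftLeft64 are hypothetical names). The first shows the hardware masking of the shift count that the rotate/flags comment relies on; the second shows the 64-bit left-shift decomposition that motivates marking sourceLo delay-free for GT_LSH_HI:

    #include <cstdint>

    // 32-bit shifts use count & 31; 64-bit shifts use count & 63. A count
    // that masks to zero performs no shift and leaves the flags untouched.
    uint32_t Shl32(uint32_t value, unsigned count)
    {
        return value << (count & 31);
    }

    // GT_LSH_HI computes the high half of a long shift and still reads the
    // low half (shld-style), so the low source must stay live (delay-free).
    void ShiftLeft64(uint32_t& lo, uint32_t& hi, unsigned count /* 1..31 */)
    {
        hi = (hi << count) | (lo >> (32 - count)); // GT_LSH_HI
        lo <<= count;                              // low half
    }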
@@ -1088,6 +1234,12 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
assert(ctrlExpr == nullptr);
assert(call->gtCallAddr != nullptr);
ctrlExpr = call->gtCallAddr;
+
+#ifdef _TARGET_X86_
+ // Fast tail calls aren't currently supported on x86, but if they ever are, the code
+ // below that handles indirect VSD calls will need to be fixed.
+ assert(!call->IsFastTailCall() || !call->IsVirtualStub());
+#endif // _TARGET_X86_
}
// set reg requirements on call target represented as control sequence.
@@ -1103,7 +1255,24 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
// computed into a register.
if (!call->IsFastTailCall())
{
- if (ctrlExpr->isIndir())
+#ifdef _TARGET_X86_
+ // On x86, we need to generate a very specific pattern for indirect VSD calls:
+ //
+ // 3-byte nop
+ // call dword ptr [eax]
+ //
+ // Where EAX is also used as an argument to the stub dispatch helper. Make
+ // sure that the call target address is computed into EAX in this case.
+ if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
+ {
+ assert(ctrlExpr->isIndir());
+
+ ctrlExpr->gtGetOp1()->gtLsraInfo.setSrcCandidates(l, RBM_VIRTUAL_STUB_TARGET);
+ MakeSrcContained(call, ctrlExpr);
+ }
+ else
+#endif // _TARGET_X86_
+ if (ctrlExpr->isIndir())
{
MakeSrcContained(call, ctrlExpr);
}
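
For context, the call-site shape the x86 VSD constraint above arranges for. Only the "3-byte nop; call dword ptr [eax]" pattern comes from the comment in the hunk; the byte encodings here are my assumption:

    // Illustrative x86 indirect VSD call site:
    //   0F 1F 00   nop dword ptr [eax]    ; 3-byte nop
    //   FF 10      call dword ptr [eax]   ; EAX also feeds the stub helper
    //
    // Pinning the target address to EAX (RBM_VIRTUAL_STUB_TARGET) and
    // containing the indirection lets codegen emit exactly this pattern.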
@@ -1191,7 +1360,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
// First, count reg args
for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
{
- assert(list->IsList());
+ assert(list->OperIsList());
GenTreePtr argNode = list->Current();
@@ -1206,7 +1375,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
argNode->gtLsraInfo.srcCount = 1;
argNode->gtLsraInfo.dstCount = 0;
-#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
// If the node is TYP_STRUCT and it is put on stack with
// putarg_stk operation, we consume and produce no registers.
// In this case the embedded Obj node should not produce
@@ -1218,7 +1387,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0;
argNode->gtLsraInfo.srcCount = 0;
}
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+#endif // FEATURE_PUT_STRUCT_ARG_STK
continue;
}
@@ -1248,7 +1417,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
// If the struct arg is wrapped in CPYBLK the type of the param will be TYP_VOID.
// Use the curArgTabEntry's isStruct to get whether the param is a struct.
- if (varTypeIsStruct(argNode) FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(|| curArgTabEntry->isStruct))
+ if (varTypeIsStruct(argNode) PUT_STRUCT_ARG_STK_ONLY(|| curArgTabEntry->isStruct))
{
unsigned originalSize = 0;
LclVarDsc* varDsc = nullptr;
@@ -1270,16 +1439,16 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
{
originalSize = genTypeSize(argNode->gtType);
}
- else if (argNode->gtOper == GT_LIST)
+ else if (argNode->gtOper == GT_FIELD_LIST)
{
originalSize = 0;
// There could be up to 2 PUTARG_REGs in the list
- GenTreeArgList* argListPtr = argNode->AsArgList();
- unsigned iterationNum = 0;
- for (; argListPtr; argListPtr = argListPtr->Rest())
+ GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
+ unsigned iterationNum = 0;
+ for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest())
{
- GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1;
+ GenTreePtr putArgRegNode = fieldListPtr->Current();
assert(putArgRegNode->gtOper == GT_PUTARG_REG);
if (iterationNum == 0)
@@ -1509,7 +1678,7 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
}
m_lsra->clearOperandCounts(source);
}
- else if (!source->OperIsSIMD())
+ else if (!source->IsMultiRegCall() && !source->OperIsSIMD())
{
assert(source->IsLocal());
MakeSrcContained(blkNode, source);
@@ -1519,7 +1688,11 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
if (isInitBlk)
{
GenTree* initVal = source;
- srcAddrOrFill = source;
+ if (initVal->OperIsInitVal())
+ {
+ initVal = initVal->gtGetOp1();
+ }
+ srcAddrOrFill = initVal;
// If we have an InitBlk with constant block size we can optimize several ways:
// a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes
// we use rep stosb since this reduces the register pressure in LSRA and we have
@@ -1571,8 +1744,23 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
// a pack of 16 init value constants.
blkNode->gtLsraInfo.internalFloatCount = 1;
blkNode->gtLsraInfo.setInternalCandidates(l, l->internalFloatRegCandidates());
+ if ((fill == 0) && ((size & 0xf) == 0))
+ {
+ MakeSrcContained(blkNode, source);
+ }
}
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
+
+#ifdef _TARGET_X86_
+ if ((size & 1) != 0)
+ {
+ // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
+ // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
+ // when unrolling, so only allow byteable registers as the source value. (We could
+ // consider just using BlkOpKindRepInstr instead.)
+ sourceRegMask = RBM_BYTE_REGS;
+ }
+#endif // _TARGET_X86_
}
else
{
@@ -1825,7 +2013,7 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
}
}
-#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
//------------------------------------------------------------------------
// TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
//
@@ -1835,44 +2023,219 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
// Return Value:
// None.
//
-void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree)
+void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
{
- TreeNodeInfo* info = &(tree->gtLsraInfo);
+ TreeNodeInfo* info = &(putArgStk->gtLsraInfo);
LinearScan* l = m_lsra;
- if (tree->TypeGet() != TYP_STRUCT)
+#ifdef _TARGET_X86_
+ if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
+ {
+ putArgStk->gtNumberReferenceSlots = 0;
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Invalid;
+
+ GenTreeFieldList* fieldList = putArgStk->gtOp1->AsFieldList();
+
+ // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order
+ // of uses is visible to LSRA.
+ unsigned fieldCount = 0;
+ GenTreeFieldList* head = nullptr;
+ for (GenTreeFieldList *current = fieldList, *next; current != nullptr; current = next)
+ {
+ next = current->Rest();
+
+ // First, insert the field node into the sorted list.
+ GenTreeFieldList* prev = nullptr;
+ for (GenTreeFieldList* cursor = head;; cursor = cursor->Rest())
+ {
+ // If the offset of the current list node is greater than the offset of the cursor or if we have
+ // reached the end of the list, insert the current node before the cursor and terminate.
+ if ((cursor == nullptr) || (current->gtFieldOffset > cursor->gtFieldOffset))
+ {
+ if (prev == nullptr)
+ {
+ assert(cursor == head);
+ head = current;
+ }
+ else
+ {
+ prev->Rest() = current;
+ }
+
+ current->Rest() = cursor;
+ break;
+ }
+ }
+
+ fieldCount++;
+ }
+
+ info->srcCount = fieldCount;
+ info->dstCount = 0;
+
+ // In theory, the upper bound for the size of a field list is 8: these constructs only appear when passing the
+ // collection of lclVars that represent the fields of a promoted struct lclVar, and we do not promote struct
+ // lclVars with more than 4 fields. If each of these lclVars is of type long, decomposition will split the
+ // corresponding field list nodes in two, giving an upper bound of 8.
+ //
+ // The reason that this is important is that the algorithm we use above to sort the field list is O(N^2): if
+ // the maximum size of a field list grows significantly, we will need to reevaluate it.
+ assert(fieldCount <= 8);
+
+ // The sort above may have changed which node is at the head of the list. Update the PUTARG_STK node if
+ // necessary.
+ if (head != fieldList)
+ {
+ head->gtFlags |= GTF_FIELD_LIST_HEAD;
+ fieldList->gtFlags &= ~GTF_FIELD_LIST_HEAD;
+
+#ifdef DEBUG
+ head->gtSeqNum = fieldList->gtSeqNum;
+#endif // DEBUG
+
+ head->gtLsraInfo = fieldList->gtLsraInfo;
+ head->gtClearReg(comp);
+
+ BlockRange().InsertAfter(fieldList, head);
+ BlockRange().Remove(fieldList);
+
+ fieldList = head;
+ putArgStk->gtOp1 = fieldList;
+ }
+
+ // Now that the fields have been sorted, initialize the LSRA info.
+ bool allFieldsAreSlots = true;
+ bool needsByteTemp = false;
+ unsigned prevOffset = putArgStk->getArgSize();
+ for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
+ {
+ GenTree* const fieldNode = current->Current();
+ const var_types fieldType = fieldNode->TypeGet();
+ const unsigned fieldOffset = current->gtFieldOffset;
+ assert(fieldType != TYP_LONG);
+
+ // For x86 we must mark all integral fields as contained or reg-optional, and handle them
+ // accordingly in code generation, since we may have up to 8 fields, which cannot all be in
+ // registers to be consumed atomically by the call.
+ if (varTypeIsIntegralOrI(fieldNode))
+ {
+ if (fieldNode->OperGet() == GT_LCL_VAR)
+ {
+ LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]);
+ if (varDsc->lvTracked && !varDsc->lvDoNotEnregister)
+ {
+ SetRegOptional(fieldNode);
+ }
+ else
+ {
+ MakeSrcContained(putArgStk, fieldNode);
+ }
+ }
+ else if (fieldNode->IsIntCnsFitsInI32())
+ {
+ MakeSrcContained(putArgStk, fieldNode);
+ }
+ else
+ {
+ // For the case where we cannot directly push the value, if we run out of registers,
+ // it would be better to defer computation until we are pushing the arguments rather
+ // than spilling, but this situation is not all that common, as most cases of promoted
+ // structs do not have a large number of fields, and of those most are lclVars or
+ // copy-propagated constants.
+ SetRegOptional(fieldNode);
+ }
+ }
+ else
+ {
+ assert(varTypeIsFloating(fieldNode));
+ }
+
+ // We can treat as a slot any field that is stored at a slot boundary, where the previous
+ // field is not in the same slot. (Note that we store the fields in reverse order.)
+ const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
+ if (!fieldIsSlot)
+ {
+ allFieldsAreSlots = false;
+ if (varTypeIsByte(fieldType))
+ {
+ // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
+ // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
+ // need a byte-addressable register for the store. We will enforce this requirement on an internal
+ // register, which we can use to copy multiple byte values.
+ needsByteTemp = true;
+ }
+ }
+
+ if (varTypeIsGC(fieldType))
+ {
+ putArgStk->gtNumberReferenceSlots++;
+ }
+
+ prevOffset = fieldOffset;
+ }
+
+ // Set the copy kind.
+ // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should
+ // adjust the stack once for those fields. The latter is really best done in code generation, but
+ // this tuning should probably be undertaken as a whole.
+ // Also, if there are floating point fields, it may be better to use the "Unroll" mode
+ // of copying the struct as a whole, if the fields are not register candidates.
+ if (allFieldsAreSlots)
+ {
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots;
+ }
+ else
+ {
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
+ // If any of the fields cannot be stored with an actual push, we may need a temporary
+ // register to load the value before storing it to the stack location.
+ info->internalIntCount = 1;
+ regMaskTP regMask = l->allRegs(TYP_INT);
+ if (needsByteTemp)
+ {
+ regMask &= ~RBM_NON_BYTE_REGS;
+ }
+ info->setInternalCandidates(l, regMask);
+ }
+ return;
+ }
+#endif // _TARGET_X86_
+
+#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+ // For PutArgStk of a TYP_SIMD12, we need an extra register.
+ if (putArgStk->TypeGet() == TYP_SIMD12)
{
- TreeNodeInfoInitSimple(tree);
+ info->srcCount = putArgStk->gtOp1->gtLsraInfo.dstCount;
+ info->dstCount = 0;
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(l, l->allSIMDRegs());
return;
}
+#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
- GenTreePutArgStk* putArgStkTree = tree->AsPutArgStk();
+ if (putArgStk->TypeGet() != TYP_STRUCT)
+ {
+ TreeNodeInfoInitSimple(putArgStk);
+ return;
+ }
- GenTreePtr dst = tree;
- GenTreePtr src = tree->gtOp.gtOp1;
+ GenTreePtr dst = putArgStk;
+ GenTreePtr src = putArgStk->gtOp1;
GenTreePtr srcAddr = nullptr;
+ bool haveLocalAddr = false;
if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
{
srcAddr = src->gtOp.gtOp1;
+ assert(srcAddr != nullptr);
+ haveLocalAddr = srcAddr->OperIsLocalAddr();
}
else
{
- assert(varTypeIsSIMD(tree));
- }
- info->srcCount = src->gtLsraInfo.dstCount;
-
- // If this is a stack variable address,
- // make the op1 contained, so this way
- // there is no unnecessary copying between registers.
- // To avoid assertion, increment the parent's source.
- // It is recovered below.
- bool haveLocalAddr = ((srcAddr != nullptr) && (srcAddr->OperIsLocalAddr()));
- if (haveLocalAddr)
- {
- info->srcCount += 1;
+ assert(varTypeIsSIMD(putArgStk));
}
+ info->srcCount = src->gtLsraInfo.dstCount;
info->dstCount = 0;
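
The reordering loop in the hunk above is an insertion sort into descending-offset order. A standalone sketch over a simplified node type (hypothetical; the real code links GenTreeFieldList nodes through Rest()):

    struct FieldNode
    {
        unsigned   offset;
        FieldNode* rest;
    };

    // Insertion sort, descending by offset. O(N^2), which is acceptable
    // because the field count is bounded (asserted <= 8 above).
    FieldNode* SortByOffsetDescending(FieldNode* list)
    {
        FieldNode* head = nullptr;
        for (FieldNode *current = list, *next; current != nullptr; current = next)
        {
            next = current->rest;

            FieldNode* prev   = nullptr;
            FieldNode* cursor = head;
            while ((cursor != nullptr) && (current->offset <= cursor->offset))
            {
                prev   = cursor;
                cursor = cursor->rest;
            }

            current->rest = cursor;
            if (prev == nullptr)
            {
                head = current;
            }
            else
            {
                prev->rest = current;
            }
        }
        return head;
    }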
// In case of a CpBlk we could use a helper call. In case of putarg_stk we
@@ -1884,7 +2247,7 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree)
// This threshold will decide from using the helper or let the JIT decide to inline
// a code sequence of its choice.
ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
- ssize_t size = putArgStkTree->gtNumSlots * TARGET_POINTER_SIZE;
+ ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
// TODO-X86-CQ: The helper call either is not supported on x86 or required more work
// (I don't know which).
@@ -1892,7 +2255,7 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree)
// If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
// Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of
// our framework assemblies, so this is the main code generation scheme we'll use.
- if (size <= CPBLK_UNROLL_LIMIT && putArgStkTree->gtNumberReferenceSlots == 0)
+ if (size <= CPBLK_UNROLL_LIMIT && putArgStk->gtNumberReferenceSlots == 0)
{
// If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
//
@@ -1913,46 +2276,62 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree)
info->setInternalCandidates(l, regMask);
}
+#ifdef _TARGET_X86_
+ if (size >= 8)
+#else // !_TARGET_X86_
if (size >= XMM_REGSIZE_BYTES)
+#endif // !_TARGET_X86_
{
- // If we have a buffer larger than XMM_REGSIZE_BYTES,
- // reserve an XMM register to use it for a
+ // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
+ // or larger than or equal to 8 bytes on x86, reserve an XMM register to use for a
// series of 16-byte loads and stores.
info->internalFloatCount = 1;
info->addInternalCandidates(l, l->internalFloatRegCandidates());
}
- if (haveLocalAddr)
+#ifdef _TARGET_X86_
+ if (size < XMM_REGSIZE_BYTES)
{
- MakeSrcContained(putArgStkTree, srcAddr);
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
}
-
- // If src or dst are on stack, we don't have to generate the address into a register
- // because it's just some constant+SP
- putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindUnroll;
+ else
+#endif // _TARGET_X86_
+ {
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
+ }
+ }
+#ifdef _TARGET_X86_
+ else if (putArgStk->gtNumberReferenceSlots != 0)
+ {
+ // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
+ // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
}
+#endif // _TARGET_X86_
else
{
info->internalIntCount += 3;
info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI));
- if (haveLocalAddr)
- {
- MakeSrcContained(putArgStkTree, srcAddr);
- }
- putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindRepInstr;
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
}
// Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
- MakeSrcContained(putArgStkTree, src);
+ MakeSrcContained(putArgStk, src);
- // Balance up the inc above.
if (haveLocalAddr)
{
- info->srcCount -= 1;
+ // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary
+ // copies.
+ //
+ // To avoid an assertion in MakeSrcContained, increment the parent's source count beforehand and decrement it
+ // afterwards.
+ info->srcCount++;
+ MakeSrcContained(putArgStk, srcAddr);
+ info->srcCount--;
}
}
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+#endif // FEATURE_PUT_STRUCT_ARG_STK
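
A sketch of the copy strategy the internal registers above serve: 16-byte XMM chunks for the bulk of the buffer, with the sub-XMM remainder going through an integer temp (intrinsics and the CopyBlockUnrolled name are for illustration only; the JIT emits the loads and stores directly):

    #include <cstddef>
    #include <cstring>
    #include <emmintrin.h>

    void CopyBlockUnrolled(void* dst, const void* src, size_t size)
    {
        size_t i = 0;
        for (; i + 16 <= size; i += 16) // series of 16-byte loads/stores
        {
            __m128i chunk = _mm_loadu_si128((const __m128i*)((const char*)src + i));
            _mm_storeu_si128((__m128i*)((char*)dst + i), chunk);
        }
        // A remainder smaller than XMM_REGSIZE_BYTES is copied through an
        // integer temp register (memcpy stands in for that here).
        memcpy((char*)dst + i, (const char*)src + i, size - i);
    }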
//------------------------------------------------------------------------
// TreeNodeInfoInitLclHeap: Set the NodeInfo for a GT_LCLHEAP.
@@ -1976,13 +2355,17 @@ void Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
// Here '-' means don't care.
//
// Size? Init Memory? # temp regs
- // 0 - 0
- // const and <=6 reg words - 0
- // const and >6 reg words Yes 0
+ // 0 - 0 (returns 0)
+ // const and <=6 reg words - 0 (pushes '0')
+ // const and >6 reg words Yes 0 (pushes '0')
// const and <PageSize No 0 (amd64) 1 (x86)
- // const and >=PageSize No 2
- // Non-const Yes 0
- // Non-const No 2
+ // (x86: tmpReg for subtracting from esp)
+ // const and >=PageSize No 2 (regCnt and tmpReg for subtracting from sp)
+ // Non-const Yes 0 (regCnt=targetReg and pushes '0')
+ // Non-const No 2 (regCnt and tmpReg for subtracting from sp)
+ //
+ // Note: Here we don't need the internal register to be different from targetReg;
+ // rather, we require it to be different from the operand's reg.
GenTreePtr size = tree->gtOp.gtOp1;
if (size->IsCnsIntOrI())
@@ -2121,6 +2504,9 @@ void Lowering::TreeNodeInfoInitLogicalOp(GenTree* tree)
// as reg optional.
SetRegOptionalForBinOp(tree);
}
+
+ // Codegen of this tree node sets ZF and SF flags.
+ tree->gtFlags |= GTF_ZSF_SET;
}
//------------------------------------------------------------------------
@@ -2189,15 +2575,40 @@ void Lowering::TreeNodeInfoInitModDiv(GenTree* tree)
info->setDstCandidates(l, RBM_RAX);
}
- // If possible would like to have op1 in RAX to avoid a register move
- op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
+ bool op2CanBeRegOptional = true;
+#ifdef _TARGET_X86_
+ if (op1->OperGet() == GT_LONG)
+ {
+ // To avoid a register move, we would like op1's low part in RAX and its high part in RDX.
+ GenTree* loVal = op1->gtGetOp1();
+ GenTree* hiVal = op1->gtGetOp2();
+
+ // Src count is actually 3, so increment.
+ assert(op2->IsCnsIntOrI());
+ assert(tree->OperGet() == GT_UMOD);
+ info->srcCount++;
+ op2CanBeRegOptional = false;
+
+ // This situation also requires an internal register.
+ info->internalIntCount = 1;
+ info->setInternalCandidates(l, l->allRegs(TYP_INT));
+
+ loVal->gtLsraInfo.setSrcCandidates(l, RBM_EAX);
+ hiVal->gtLsraInfo.setSrcCandidates(l, RBM_EDX);
+ }
+ else
+#endif
+ {
+ // If possible would like to have op1 in RAX to avoid a register move
+ op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
+ }
// divisor can be an r/m, but the memory indirection must be of the same size as the divide
if (op2->isMemoryOp() && (op2->TypeGet() == tree->TypeGet()))
{
MakeSrcContained(tree, op2);
}
- else
+ else if (op2CanBeRegOptional)
{
op2->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
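
A sketch of the register constraint above: x86 `div r/m32` divides EDX:EAX, so for a GT_UMOD with a GT_LONG dividend the low half is pinned to EAX and the high half to EDX (illustrative C++ under that assumption, not the emitted code; UModLong is a made-up name):

    #include <cstdint>

    // loVal ends up in EAX, hiVal in EDX; div leaves the remainder in EDX.
    // The JIT only takes this path for a constant divisor, where the
    // quotient is known to fit in 32 bits.
    uint32_t UModLong(uint32_t loVal, uint32_t hiVal, uint32_t divisor)
    {
        uint64_t dividend = ((uint64_t)hiVal << 32) | loVal;
        return (uint32_t)(dividend % divisor);
    }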
@@ -2298,12 +2709,13 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
info->dstCount = 1;
switch (simdTree->gtSIMDIntrinsicID)
{
+ GenTree* op1;
GenTree* op2;
case SIMDIntrinsicInit:
{
info->srcCount = 1;
- GenTree* op1 = tree->gtOp.gtOp1;
+ op1 = tree->gtOp.gtOp1;
// This sets all fields of a SIMD struct to the given value.
// Mark op1 as contained if it is either zero or int constant of all 1's,
@@ -2377,7 +2789,8 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
info->srcCount = 2;
// SSE2 32-bit integer multiplication requires two temp regs
- if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT)
+ if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
+ comp->getSIMDInstructionSet() == InstructionSet_SSE2)
{
info->internalFloatCount = 2;
info->setInternalCandidates(lsra, lsra->allSIMDRegs());
@@ -2406,38 +2819,78 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
case SIMDIntrinsicOpEquality:
case SIMDIntrinsicOpInEquality:
- // Need two SIMD registers as scratch.
- // See genSIMDIntrinsicRelOp() for details on code sequence generate and
- // the need for two scratch registers.
- info->srcCount = 2;
- info->internalFloatCount = 2;
- info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ info->srcCount = 2;
+
+ // On SSE4/AVX, we can generate optimal code for (in)equality
+ // against zero using ptest. We can safely do this optimization
+ // for integral vectors but not for floating-point vectors, because
+ // floating point has both +0.0 and -0.0, and +0.0 == -0.0.
+ op2 = tree->gtGetOp2();
+ if ((comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0))
+ {
+ MakeSrcContained(tree, op2);
+ }
+ else
+ {
+
+ // Need one SIMD register as scratch.
+ // See genSIMDIntrinsicRelOp() for details on code sequence generated and
+ // the need for one scratch register.
+ //
+ // Note that these intrinsics produce a BOOL result, so the reserved
+ // internal float registers are guaranteed to differ from the integer
+ // target register without our specifying it explicitly.
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
break;
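
The ptest optimization above, sketched with intrinsics (assumes SSE4.1 is available; illustrative only, AllZero is a hypothetical helper): comparing an integral vector against zero sets ZF directly, so no integer scratch register is needed to materialize a mask.

    #include <smmintrin.h>

    bool AllZero(__m128i v)
    {
        // ptest sets ZF = ((v & v) == 0); codegen can branch on the flag.
        return _mm_testz_si128(v, v) != 0;
    }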
case SIMDIntrinsicDotProduct:
- if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) ||
- (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
+ // Float/Double vectors:
+ // For SSE, or AVX with 32-byte vectors, we also need an internal register
+ // as scratch. Further we need the targetReg and internal reg to be distinct
+ // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
+ // don't need a tmpReg.
+ //
+ // 32-byte integer vector on SSE4/AVX:
+ // will take advantage of phaddd, which operates only on 128-bit xmm reg.
+ // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
+ // registers since targetReg is an int type register.
+ //
+ // See genSIMDIntrinsicDotProduct() for details on code sequence generated
+ // and the need for scratch registers.
+ if (varTypeIsFloating(simdTree->gtSIMDBaseType))
{
- // For SSE, or AVX with 32-byte vectors, we also need an internal register as scratch.
- // Further we need the targetReg and internal reg to be distinct registers.
- // This is achieved by requesting two internal registers; thus one of them
- // will be different from targetReg.
- // Note that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
- //
- // See genSIMDIntrinsicDotProduct() for details on code sequence generated and
- // the need for scratch registers.
- info->internalFloatCount = 2;
+ if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) ||
+ (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
+ {
+ info->internalFloatCount = 1;
+ info->isInternalRegDelayFree = true;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
+ // else don't need scratch reg(s).
+ }
+ else
+ {
+ assert(simdTree->gtSIMDBaseType == TYP_INT && comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4);
+
+ // No need to set isInternalRegDelayFree since targetReg is an
+ // int type reg and is guaranteed to be different from the xmm/ymm
+ // regs.
+ info->internalFloatCount = comp->canUseAVX() ? 2 : 1;
info->setInternalCandidates(lsra, lsra->allSIMDRegs());
}
info->srcCount = 2;
break;
case SIMDIntrinsicGetItem:
+ {
// This implements get_Item method. The sources are:
// - the source SIMD struct
// - index (which element to get)
// The result is baseType of SIMD struct.
info->srcCount = 2;
+ op1 = tree->gtOp.gtOp1;
op2 = tree->gtOp.gtOp2;
// If the index is a constant, mark it as contained.
@@ -2446,48 +2899,69 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
info->srcCount = 1;
}
- // If the index is not a constant, we will use the SIMD temp location to store the vector.
- // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
- // can use that in the process of extracting the element.
- //
- // If the index is a constant and base type is a small int we can use pextrw, but on AVX
- // we will need a temp if are indexing into the upper half of the AVX register.
- // In all other cases with constant index, we need a temp xmm register to extract the
- // element if index is other than zero.
-
- if (!op2->IsCnsIntOrI())
+ if (op1->isMemoryOp())
{
- (void)comp->getSIMDInitTempVarNum();
+ MakeSrcContained(tree, op1);
+
+ // Although GT_IND of TYP_SIMD12 reserves an internal float
+ // register for reading 4 and 8 bytes from memory and
+ // assembling them into target XMM reg, it is not required
+ // in this case.
+ op1->gtLsraInfo.internalIntCount = 0;
+ op1->gtLsraInfo.internalFloatCount = 0;
}
- else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
+ else
{
- bool needFloatTemp;
- if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
- (comp->getSIMDInstructionSet() == InstructionSet_AVX))
- {
- int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
- needFloatTemp = (byteShiftCnt >= 16);
- }
- else
+ // If the index is not a constant, we will use the SIMD temp location to store the vector.
+ // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
+ // can use that in the process of extracting the element.
+ //
+ // If the index is a constant and base type is a small int we can use pextrw, but on AVX
+ // we will need a temp if we are indexing into the upper half of the AVX register.
+ // In all other cases with constant index, we need a temp xmm register to extract the
+ // element if index is other than zero.
+
+ if (!op2->IsCnsIntOrI())
{
- needFloatTemp = !op2->IsIntegralConst(0);
+ (void)comp->getSIMDInitTempVarNum();
}
- if (needFloatTemp)
+ else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
{
- info->internalFloatCount = 1;
- info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ bool needFloatTemp;
+ if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
+ (comp->getSIMDInstructionSet() == InstructionSet_AVX))
+ {
+ int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
+ needFloatTemp = (byteShiftCnt >= 16);
+ }
+ else
+ {
+ needFloatTemp = !op2->IsIntegralConst(0);
+ }
+
+ if (needFloatTemp)
+ {
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
}
}
- break;
+ }
+ break;
case SIMDIntrinsicSetX:
case SIMDIntrinsicSetY:
case SIMDIntrinsicSetZ:
case SIMDIntrinsicSetW:
- // We need an internal integer register
- info->srcCount = 2;
- info->internalIntCount = 1;
- info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT));
+ info->srcCount = 2;
+
+ // We need an internal integer register for SSE2 codegen
+ if (comp->getSIMDInstructionSet() == InstructionSet_SSE2)
+ {
+ info->internalIntCount = 1;
+ info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT));
+ }
+
break;
case SIMDIntrinsicCast:
@@ -2592,6 +3066,8 @@ void Lowering::TreeNodeInfoInitCast(GenTree* tree)
{
if (genTypeSize(castOpType) == 8)
{
+ // Here we don't need the internal register to be different from targetReg;
+ // rather, we require it to be different from the operand's reg.
info->internalIntCount = 1;
}
}
@@ -2693,7 +3169,6 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
GenTreePtr index = nullptr;
unsigned mul, cns;
bool rev;
- bool modifiedSources = false;
#ifdef FEATURE_SIMD
// If indirTree is of TYP_SIMD12, don't mark addr as contained
@@ -2711,11 +3186,10 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
info->internalFloatCount = 1;
// In case of GT_IND we need an internal register different from targetReg and
- // both of the registers are used at the same time. This achieved by reserving
- // two internal registers
+ // both of the registers are used at the same time.
if (indirTree->OperGet() == GT_IND)
{
- (info->internalFloatCount)++;
+ info->isInternalRegDelayFree = true;
}
info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
@@ -2724,16 +3198,21 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
}
#endif // FEATURE_SIMD
- // These nodes go into an addr mode:
- // - GT_CLS_VAR_ADDR turns into a constant.
- // - GT_LCL_VAR_ADDR is a stack addr mode.
- if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
+ if ((indirTree->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0)
{
+ // The address of an indirection that requires its address in a reg.
+ // Skip any further processing that might otherwise make it contained.
+ }
+ else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
+ {
+ // These nodes go into an addr mode:
+ // - GT_CLS_VAR_ADDR turns into a constant.
+ // - GT_LCL_VAR_ADDR is a stack addr mode.
+
// make this contained, it turns into a constant that goes into an addr mode
MakeSrcContained(indirTree, addr);
}
- else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp) &&
- addr->gtLsraInfo.getDstCandidates(m_lsra) != RBM_VIRTUAL_STUB_PARAM)
+ else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp))
{
// Amd64:
// We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address.
@@ -2755,17 +3234,10 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
}
else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr))
{
- GenTreeAddrMode* lea = addr->AsAddrMode();
- base = lea->Base();
- index = lea->Index();
-
- m_lsra->clearOperandCounts(addr);
- // The srcCount is decremented because addr is now "contained",
- // then we account for the base and index below, if they are non-null.
- info->srcCount--;
+ MakeSrcContained(indirTree, addr);
}
else if (comp->codeGen->genCreateAddrMode(addr, -1, true, 0, &rev, &base, &index, &mul, &cns, true /*nogen*/) &&
- !(modifiedSources = AreSourcesPossiblyModifiedLocals(indirTree, base, index)))
+ !AreSourcesPossiblyModifiedLocals(indirTree, base, index))
{
// An addressing mode will be constructed that may cause some
// nodes to not need a register, and cause others' lifetimes to be extended
@@ -2774,7 +3246,16 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
assert(base != addr);
m_lsra->clearOperandCounts(addr);
- GenTreePtr arrLength = nullptr;
+ const bool hasBase = base != nullptr;
+ const bool hasIndex = index != nullptr;
+ assert(hasBase || hasIndex); // At least one of a base or an index must be present.
+
+ // If the addressing mode has both a base and an index, bump its source count by one. If it only has one or the
+ // other, its source count is already correct (due to the source for the address itself).
+ if (hasBase && hasIndex)
+ {
+ info->srcCount++;
+ }
// Traverse the computation below GT_IND to find the operands
// for the addressing mode, marking the various constants and
@@ -2784,14 +3265,13 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
// up of simple arithmetic operators, and the code generator
// only traverses one leg of each node.
- bool foundBase = (base == nullptr);
- bool foundIndex = (index == nullptr);
- GenTreePtr nextChild = nullptr;
- for (GenTreePtr child = addr; child != nullptr && !child->OperIsLeaf(); child = nextChild)
+ bool foundBase = !hasBase;
+ bool foundIndex = !hasIndex;
+ for (GenTree *child = addr, *nextChild = nullptr; child != nullptr && !child->OperIsLeaf(); child = nextChild)
{
- nextChild = nullptr;
- GenTreePtr op1 = child->gtOp.gtOp1;
- GenTreePtr op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr;
+ nextChild = nullptr;
+ GenTree* op1 = child->gtOp.gtOp1;
+ GenTree* op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr;
if (op1 == base)
{
@@ -2832,7 +3312,6 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
}
}
assert(foundBase && foundIndex);
- info->srcCount--; // it gets incremented below.
}
else if (addr->gtOper == GT_ARR_ELEM)
{
@@ -2845,32 +3324,23 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
assert(addr->gtLsraInfo.srcCount >= 2);
addr->gtLsraInfo.srcCount -= 1;
}
- else
- {
- // it is nothing but a plain indir
- info->srcCount--; // base gets added in below
- base = addr;
- }
-
- if (base != nullptr)
- {
- info->srcCount++;
- }
-
- if (index != nullptr && !modifiedSources)
- {
- info->srcCount++;
- }
}
-void Lowering::LowerCmp(GenTreePtr tree)
+void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree)
{
+ assert(tree->OperIsCompare());
+
TreeNodeInfo* info = &(tree->gtLsraInfo);
info->srcCount = 2;
info->dstCount = 1;
#ifdef _TARGET_X86_
+ // If the compare is used by a jump, we just need to set the condition codes. If not, then we need
+ // to store the result into the low byte of a register, which requires the dst be a byteable register.
+ // We always set the dst candidates, though, because if this compare is consumed by a jump, they
+ // won't be used. We might be able to use GTF_RELOP_JMP_USED to determine this case, but it's not clear
+ // that flag is maintained until this location (especially for decomposed long compares).
info->setDstCandidates(m_lsra, RBM_BYTE_REGS);
#endif // _TARGET_X86_
@@ -2894,9 +3364,9 @@ void Lowering::LowerCmp(GenTreePtr tree)
#endif // !defined(_TARGET_64BIT_)
// If either of op1 or op2 is floating point values, then we need to use
- // ucomiss or ucomisd to compare, both of which support the following form
- // ucomis[s|d] xmm, xmm/mem. That is only the second operand can be a memory
- // op.
+ // ucomiss or ucomisd to compare, both of which support the following form:
+ // ucomis[s|d] xmm, xmm/mem
+ // That is, only the second operand can be a memory op.
//
// Second operand is a memory Op: Note that depending on comparison operator,
// the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or
@@ -2952,16 +3422,9 @@ void Lowering::LowerCmp(GenTreePtr tree)
bool hasShortCast = false;
if (CheckImmedAndMakeContained(tree, op2))
{
- bool op1CanBeContained = (op1Type == op2Type);
- if (!op1CanBeContained)
- {
- if (genTypeSize(op1Type) == genTypeSize(op2Type))
- {
- // The constant is of the correct size, but we don't have an exact type match
- // We can treat the isMemoryOp as "contained"
- op1CanBeContained = true;
- }
- }
+ // If the types are the same, or if the constant is of the correct size,
+ // we can treat the isMemoryOp as contained.
+ bool op1CanBeContained = (genTypeSize(op1Type) == genTypeSize(op2Type));
// Do we have a short compare against a constant in op2
//
@@ -3031,13 +3494,13 @@ void Lowering::LowerCmp(GenTreePtr tree)
bool op1IsMadeContained = false;
// When op1 is a GT_AND we can often generate a single "test" instruction
- // instead of two instructions (an "and" instruction followed by a "cmp"/"test")
+ // instead of two instructions (an "and" instruction followed by a "cmp"/"test").
//
- // This instruction can only be used for equality or inequality comparions.
+ // This instruction can only be used for equality or inequality comparisons.
// and we must have a compare against zero.
//
// If we have a positive test for a single bit we can reverse the condition and
- // make the compare be against zero
+ // make the compare be against zero.
//
// Example:
// GT_EQ GT_NE
@@ -3046,8 +3509,8 @@ void Lowering::LowerCmp(GenTreePtr tree)
// / \ / \
// andOp1 GT_CNS (0x100) andOp1 GT_CNS (0x100)
//
- // We will mark the GT_AND node as contained if the tree is a equality compare with zero
- // Additionally when we do this we also allow for a contained memory operand for "andOp1".
+ // We will mark the GT_AND node as contained if the tree is an equality compare with zero.
+ // Additionally, when we do this we also allow for a contained memory operand for "andOp1".
//
bool isEqualityCompare = (tree->gtOper == GT_EQ || tree->gtOper == GT_NE);
@@ -3066,7 +3529,7 @@ void Lowering::LowerCmp(GenTreePtr tree)
// so that we can generate a test instruction.
// Reverse the equality comparison
- tree->gtOper = (tree->gtOper == GT_EQ) ? GT_NE : GT_EQ;
+ tree->SetOperRaw((tree->gtOper == GT_EQ) ? GT_NE : GT_EQ);
// Change the relOp2CnsVal to zero
relOp2CnsVal = 0;
@@ -3171,7 +3634,7 @@ void Lowering::LowerCmp(GenTreePtr tree)
genTreeOps castOp1Oper = castOp1->OperGet();
bool safeOper = false;
- // It is not always safe to change the gtType of 'castOp1' to TYP_UBYTE
+ // It is not always safe to change the gtType of 'castOp1' to TYP_UBYTE.
// For example when 'castOp1Oper' is a GT_RSZ or GT_RSH then we are shifting
// bits from the left into the lower bits. If we change the type to a TYP_UBYTE
// we will instead generate a byte sized shift operation: shr al, 24
@@ -3196,22 +3659,24 @@ void Lowering::LowerCmp(GenTreePtr tree)
//
assert(!castOp1->gtOverflowEx()); // Must not be an overflow checking operation
- GenTreePtr removeTreeNode = op1;
- tree->gtOp.gtOp1 = castOp1;
- op1 = castOp1;
- castOp1->gtType = TYP_UBYTE;
-
- // trim down the value if castOp1 is an int constant since its type changed to UBYTE.
- if (castOp1Oper == GT_CNS_INT)
- {
- castOp1->gtIntCon.gtIconVal = (UINT8)castOp1->gtIntCon.gtIconVal;
- }
-
+ // TODO-Cleanup: we're within "if (CheckImmedAndMakeContained(tree, op2))", so isn't
+ // the following condition always true?
if (op2->isContainedIntOrIImmed())
{
ssize_t val = (ssize_t)op2->AsIntConCommon()->IconValue();
if (val >= 0 && val <= 255)
{
+ GenTreePtr removeTreeNode = op1;
+ tree->gtOp.gtOp1 = castOp1;
+ op1 = castOp1;
+ castOp1->gtType = TYP_UBYTE;
+
+ // trim down the value if castOp1 is an int constant since its type changed to UBYTE.
+ if (castOp1Oper == GT_CNS_INT)
+ {
+ castOp1->gtIntCon.gtIconVal = (UINT8)castOp1->gtIntCon.gtIconVal;
+ }
+
op2->gtType = TYP_UBYTE;
tree->gtFlags |= GTF_UNSIGNED;
@@ -3222,18 +3687,26 @@ void Lowering::LowerCmp(GenTreePtr tree)
MakeSrcContained(tree, op1);
op1IsMadeContained = true;
}
- }
- }
- BlockRange().Remove(removeTreeNode);
+ BlockRange().Remove(removeTreeNode);
+
+ // We've changed the type on op1 to TYP_UBYTE, but we already processed that node.
+ // We need to go back and mark it byteable.
+ // TODO-Cleanup: it might be better to move this out of the TreeNodeInfoInit pass to
+ // the earlier "lower" pass, in which case the byteable check would just fall out.
+ // But that is quite complex!
+ TreeNodeInfoInitCheckByteable(op1);
+
#ifdef DEBUG
- if (comp->verbose)
- {
- printf("LowerCmp: Removing a GT_CAST to TYP_UBYTE and changing castOp1->gtType to "
- "TYP_UBYTE\n");
- comp->gtDispTreeRange(BlockRange(), tree);
- }
+ if (comp->verbose)
+ {
+ printf("TreeNodeInfoInitCmp: Removing a GT_CAST to TYP_UBYTE and changing "
+ "castOp1->gtType to TYP_UBYTE\n");
+ comp->gtDispTreeRange(BlockRange(), tree);
+ }
#endif
+ }
+ }
}
}
@@ -3241,6 +3714,41 @@ void Lowering::LowerCmp(GenTreePtr tree)
if (!op1IsMadeContained)
{
SetRegOptional(op1);
+
+ // If op1's codegen sets the ZF and SF flags and the compare is ==/!=
+ // against zero, we don't need to generate a test instruction,
+ // provided we don't have another GenTree node between op1
+ // and tree that could potentially modify the flags.
+ //
+ // TODO-CQ: right now the below peep is inexpensive and
+ // gets the benefit in most cases, because in the majority
+ // of cases op1, op2 and tree would be in that order in
+ // execution. In general we should be able to check that all
+ // the nodes that come after op1 in execution order do not
+ // modify the flags so that it is safe to avoid generating a
+ // test instruction. Such a check requires that on each
+ // GenTree node we need to set the info whether its codegen
+ // will modify flags.
+ //
+ // TODO-CQ: We can optimize compare against zero in the
+ // following cases by generating the branch as indicated
+ // against each case.
+ // 1) unsigned compare
+ // < 0 - always FALSE
+ // <= 0 - ZF=1 and je
+ // > 0 - ZF=0 and jne
+ // >= 0 - always TRUE
+ //
+ // 2) signed compare
+ // < 0 - SF=1 and js
+ // >= 0 - SF=0 and jns
+ if (isEqualityCompare && op1->gtSetZSFlags() && op2->IsIntegralConst(0) && (op1->gtNext == op2) &&
+ (op2->gtNext == tree))
+ {
+ // Require codegen of op1 to set the flags.
+ assert(!op1->gtSetFlags());
+ op1->gtFlags |= GTF_SET_FLAGS;
+ }
}
}
}
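
The peephole above targets patterns like the following, where the AND already sets ZF and the compare against zero becomes free (a minimal sketch; AnyBitSet is a hypothetical example function):

    #include <cstdint>

    bool AnyBitSet(uint32_t x, uint32_t mask)
    {
        // "and reg, mask" sets ZF. With GTF_SET_FLAGS on the AND node, the
        // JIT can emit "jne" directly instead of "test reg, reg; jne".
        return (x & mask) != 0;
    }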
@@ -3255,10 +3763,17 @@ void Lowering::LowerCmp(GenTreePtr tree)
{
MakeSrcContained(tree, op1);
}
+ else if (op1->IsCnsIntOrI())
+ {
+ // TODO-CQ: We should be able to support swapping op1 and op2 to generate cmp reg, imm,
+ // but there is currently an assert in CodeGen::genCompareInt().
+ // https://github.com/dotnet/coreclr/issues/7270
+ SetRegOptional(op2);
+ }
else
{
// One of op1 or op2 could be marked as reg optional
- // to indicate that codgen can still generate code
+ // to indicate that codegen can still generate code
// if one of them is on stack.
SetRegOptional(PreferredRegOptionalOperand(tree));
}
@@ -3318,7 +3833,6 @@ void Lowering::LowerCast(GenTree* tree)
var_types dstType = tree->CastToType();
var_types srcType = op1->TypeGet();
var_types tmpType = TYP_UNDEF;
- bool srcUns = false;
// force the srcType to unsigned if GT_UNSIGNED flag is set
if (tree->gtFlags & GTF_UNSIGNED)
@@ -3849,6 +4363,20 @@ bool Lowering::SetStoreIndOpCountsIfRMWMemOp(GenTreePtr storeInd)
}
m_lsra->clearOperandCounts(indirCandidateChild);
+#ifdef _TARGET_X86_
+ if (varTypeIsByte(storeInd))
+ {
+ // If storeInd is of TYP_BYTE, restrict indirOpSource's candidates to byteable registers.
+ bool containedNode = indirOpSource->gtLsraInfo.dstCount == 0;
+ if (!containedNode)
+ {
+ regMaskTP regMask = indirOpSource->gtLsraInfo.getSrcCandidates(m_lsra);
+ assert(regMask != RBM_NONE);
+ indirOpSource->gtLsraInfo.setSrcCandidates(m_lsra, regMask & ~RBM_NON_BYTE_REGS);
+ }
+ }
+#endif
+
return true;
}
@@ -3858,8 +4386,11 @@ bool Lowering::SetStoreIndOpCountsIfRMWMemOp(GenTreePtr storeInd)
*/
void Lowering::SetMulOpCounts(GenTreePtr tree)
{
+#if defined(_TARGET_X86_)
+ assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI || tree->OperGet() == GT_MUL_LONG);
+#else
assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI);
-
+#endif
TreeNodeInfo* info = &(tree->gtLsraInfo);
info->srcCount = 2;
@@ -3900,13 +4431,18 @@ void Lowering::SetMulOpCounts(GenTreePtr tree)
GenTreeIntConCommon* imm = nullptr;
GenTreePtr other = nullptr;
- // There are three forms of x86 multiply:
- // one-op form: RDX:RAX = RAX * r/m
- // two-op form: reg *= r/m
- // three-op form: reg = r/m * imm
+// There are three forms of x86 multiply:
+// one-op form: RDX:RAX = RAX * r/m
+// two-op form: reg *= r/m
+// three-op form: reg = r/m * imm
- // This special widening 32x32->64 MUL is not used on x64
- assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
+// This special widening 32x32->64 MUL is not used on x64
+#if defined(_TARGET_X86_)
+ if (tree->OperGet() != GT_MUL_LONG)
+#endif
+ {
+ assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
+ }
// Multiply should never be using small types
assert(!varTypeIsSmall(tree->TypeGet()));
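
A sketch of the widening one-op form that GT_MUL_LONG maps to; the constraint above fixes the result in EDX:EAX (illustrative C++, MulLong is a made-up name):

    #include <cstdint>

    // 32x32->64 multiply: "mul r/m32" with one operand in EAX leaves the
    // 64-bit product in EDX:EAX, which is why the dst candidates are fixed.
    uint64_t MulLong(uint32_t a, uint32_t b)
    {
        return (uint64_t)a * (uint64_t)b;
    }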
@@ -3924,12 +4460,21 @@ void Lowering::SetMulOpCounts(GenTreePtr tree)
info->setDstCandidates(m_lsra, RBM_RAX);
hasImpliedFirstOperand = true;
}
- else if (tree->gtOper == GT_MULHI)
+ else if (tree->OperGet() == GT_MULHI)
+ {
+ // Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the
+ // upper 32 bits of the result set the destination candidate to REG_RDX.
+ info->setDstCandidates(m_lsra, RBM_RDX);
+ hasImpliedFirstOperand = true;
+ }
+#if defined(_TARGET_X86_)
+ else if (tree->OperGet() == GT_MUL_LONG)
{
// have to use the encoding:RDX:RAX = RAX * rm
info->setDstCandidates(m_lsra, RBM_RAX);
hasImpliedFirstOperand = true;
}
+#endif
else if (IsContainableImmed(tree, op2) || IsContainableImmed(tree, op1))
{
if (IsContainableImmed(tree, op2))
@@ -4187,6 +4732,71 @@ GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree)
return preferredOp;
}
+#ifdef _TARGET_X86_
+//------------------------------------------------------------------------
+// ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for
+// various reasons.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// True if we need to exclude non-byteable registers; false otherwise.
+//
+bool Lowering::ExcludeNonByteableRegisters(GenTree* tree)
+{
+ // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'
+ // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT
+ // value. In this case we need to exclude esi/edi from the src candidates of op2.
+ if (varTypeIsByte(tree))
+ {
+ return true;
+ }
+ // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool.
+ else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType()))
+ {
+ return true;
+ }
+ else if (tree->OperIsCompare())
+ {
+ GenTree* op1 = tree->gtGetOp1();
+ GenTree* op2 = tree->gtGetOp2();
+
+ // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
+ // ubyte as the result of comparison and if the result needs to be materialized into a reg
+ // simply zero extend it to TYP_INT size. Here is an example of generated code:
+ // cmp dl, byte ptr[addr mode]
+ // movzx edx, dl
+ if (varTypeIsByte(op1) && varTypeIsByte(op2))
+ {
+ return true;
+ }
+ // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses
+ // ubyte as the result of the comparison and if the result needs to be materialized into a reg
+ // simply zero extend it to TYP_INT size.
+ else if (varTypeIsByte(op1) && op2->IsCnsIntOrI())
+ {
+ return true;
+ }
+ // Example5: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses
+ // ubyte as the result of the comparison and if the result needs to be materialized into a reg
+ // simply zero extend it to TYP_INT size.
+ else if (op1->IsCnsIntOrI() && varTypeIsByte(op2))
+ {
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ else
+ {
+ return false;
+ }
+}
+#endif // _TARGET_X86_
+
#endif // _TARGET_XARCH_
#endif // !LEGACY_BACKEND