author    Jiyoung Yun <jy910.yun@samsung.com>  2016-12-27 16:46:08 +0900
committer Jiyoung Yun <jy910.yun@samsung.com>  2016-12-27 16:46:08 +0900
commit    db20f3f1bb8595633a7e16c8900fd401a453a6b5
tree      e5435159cd1bf0519276363a6fe1663d1721bed3 /src/jit/lowerxarch.cpp
parent    4b4aad7217d3292650e77eec2cf4c198ea9c3b4b
Imported Upstream version 1.0.0.9127 (tag: upstream/1.0.0.9127)
Diffstat (limited to 'src/jit/lowerxarch.cpp')
 -rw-r--r--  src/jit/lowerxarch.cpp | 1104
 1 file changed, 857 insertions(+), 247 deletions(-)
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 6f98eb6661..589cef482e 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -77,7 +77,7 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
// InitBlk
MakeSrcContained(storeLoc, op1);
}
- else if (storeLoc->TypeGet() == TYP_SIMD12)
+ else if ((storeLoc->TypeGet() == TYP_SIMD12) && (storeLoc->OperGet() == GT_STORE_LCL_FLD))
{
// Need an additional register to extract upper 4 bytes of Vector3.
info->internalFloatCount = 1;
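
Reviewer note on the hunk above: a TYP_SIMD12 (Vector3) local field store writes 8 bytes plus a trailing 4 bytes, and the Z element must first be extracted into a scratch XMM register. A minimal sketch with SSE intrinsics, assuming a movsd + pshufd/shufps + movss shape (illustrative only, not the JIT's emitted code; StoreVector3 is a hypothetical helper):

    #include <xmmintrin.h>

    // Store a 12-byte vector: one 8-byte low store, then extract and store Z.
    void StoreVector3(float* dst, __m128 v)
    {
        _mm_storel_pi((__m64*)dst, v);                             // X, Y (8 bytes)
        __m128 z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));  // scratch = Z
        _mm_store_ss(dst + 2, z);                                  // Z (4 bytes)
    }

The scratch value `z` is the reason the node reserves one internal float register.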
@@ -177,6 +177,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
break;
case GT_LCL_FLD:
+ case GT_LCL_VAR:
info->srcCount = 0;
info->dstCount = 1;
@@ -185,9 +186,9 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
if (tree->TypeGet() == TYP_SIMD12)
{
// We need an internal register different from targetReg in which 'tree' produces its result
- // because both targetReg and internal reg will be in use at the same time. This is achieved
- // by asking for two internal registers.
- info->internalFloatCount = 2;
+ // because both targetReg and internal reg will be in use at the same time.
+ info->internalFloatCount = 1;
+ info->isInternalRegDelayFree = true;
info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
}
#endif
@@ -195,7 +196,16 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
case GT_STORE_LCL_FLD:
case GT_STORE_LCL_VAR:
- info->srcCount = 1;
+#ifdef _TARGET_X86_
+ if (tree->gtGetOp1()->OperGet() == GT_LONG)
+ {
+ info->srcCount = 2;
+ }
+ else
+#endif // _TARGET_X86_
+ {
+ info->srcCount = 1;
+ }
info->dstCount = 0;
LowerStoreLoc(tree->AsLclVarCommon());
break;
@@ -242,6 +252,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
break;
case GT_LIST:
+ case GT_FIELD_LIST:
case GT_ARGPLACE:
case GT_NO_OP:
case GT_START_NONGC:
@@ -319,9 +330,87 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
break;
case GT_JTRUE:
+ {
+ info->srcCount = 0;
+ info->dstCount = 0;
+
+ GenTree* cmp = tree->gtGetOp1();
+ l->clearDstCount(cmp);
+
+#ifdef FEATURE_SIMD
+ // Say we have the following IR
+ // simdCompareResult = GT_SIMD((In)Equality, v1, v2)
+ // integerCompareResult = GT_EQ/NE(simdCompareResult, true/false)
+ // GT_JTRUE(integerCompareResult)
+ //
+ // In this case we don't need to generate code for GT_EQ/GT_NE, since the SIMD (In)Equality
+ // intrinsic will set or clear the Zero flag.
+
+ genTreeOps cmpOper = cmp->OperGet();
+ if (cmpOper == GT_EQ || cmpOper == GT_NE)
+ {
+ GenTree* cmpOp1 = cmp->gtGetOp1();
+ GenTree* cmpOp2 = cmp->gtGetOp2();
+
+ if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1)))
+ {
+ // clear dstCount on SIMD node to indicate that
+ // result doesn't need to be materialized into a register.
+ l->clearOperandCounts(cmp);
+ l->clearDstCount(cmpOp1);
+ l->clearOperandCounts(cmpOp2);
+
+ // Codegen of SIMD (in)Equality uses target integer reg
+ // only for setting flags. Target reg is not needed on AVX
+ // when comparing against Vector Zero. In all other cases
+ // we need to reserve an int type internal register, since we
+ // have cleared dstCount.
+ if (compiler->canUseAVX() && cmpOp1->gtGetOp2()->IsIntegralConstVector(0))
+ {
+ // We don't need an internal register, since we use vptest
+ // for setting flags.
+ }
+ else
+ {
+ ++(cmpOp1->gtLsraInfo.internalIntCount);
+ regMaskTP internalCandidates = cmpOp1->gtLsraInfo.getInternalCandidates(l);
+ internalCandidates |= l->allRegs(TYP_INT);
+ cmpOp1->gtLsraInfo.setInternalCandidates(l, internalCandidates);
+ }
+
+ // We would have to reverse the compare oper in the following cases:
+ // 1) SIMD Equality: Sets the Zero flag on equal, otherwise clears it.
+ // Therefore, if the compare oper is == or != against false(0), we will
+ // be checking the opposite of what is required.
+ //
+ // 2) SIMD InEquality: Clears the Zero flag on true, otherwise sets it.
+ // Therefore, if the compare oper is == or != against true(1), we will
+ // be checking the opposite of what is required.
+ GenTreeSIMD* simdNode = cmpOp1->AsSIMD();
+ if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality)
+ {
+ if (cmpOp2->IsIntegralConst(0))
+ {
+ cmp->SetOper(GenTree::ReverseRelop(cmpOper));
+ }
+ }
+ else
+ {
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality);
+ if (cmpOp2->IsIntegralConst(1))
+ {
+ cmp->SetOper(GenTree::ReverseRelop(cmpOper));
+ }
+ }
+ }
+ }
+#endif // FEATURE_SIMD
+ }
+ break;
+
+ case GT_JCC:
info->srcCount = 0;
info->dstCount = 0;
- l->clearDstCount(tree->gtOp.gtOp1);
break;
case GT_JMP:
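
To make the reversal rules in the hunk above concrete, here is a small standalone sketch (a hypothetical helper, not JIT code; RelOp and RelopForSimdJTrue are names invented for illustration):

    // SIMD OpEquality sets ZF on equal; OpInEquality clears ZF on unequal.
    // Comparing that boolean against the "wrong" constant flips the sense.
    enum RelOp { EQ, NE };

    RelOp RelopForSimdJTrue(bool isOpEquality, RelOp cmpOper, int cnsVal)
    {
        // Reverse when testing OpEquality against false(0), or
        // OpInEquality against true(1), matching the comments above.
        bool reverse = isOpEquality ? (cnsVal == 0) : (cnsVal == 1);
        if (!reverse)
        {
            return cmpOper;
        }
        return (cmpOper == EQ) ? NE : EQ;
    }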
@@ -436,6 +525,9 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
case GT_MUL:
case GT_MULHI:
+#if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)
+ case GT_MUL_LONG:
+#endif
SetMulOpCounts(tree);
break;
@@ -478,6 +570,11 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
info->internalFloatCount = 1;
info->setInternalCandidates(l, l->internalFloatRegCandidates());
}
+ else
+ {
+ // Codegen of this tree node sets ZF and SF flags.
+ tree->gtFlags |= GTF_ZSF_SET;
+ }
break;
case GT_NOT:
@@ -490,6 +587,10 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
case GT_RSZ:
case GT_ROL:
case GT_ROR:
+#ifdef _TARGET_X86_
+ case GT_LSH_HI:
+ case GT_RSH_LO:
+#endif
TreeNodeInfoInitShiftRotate(tree);
break;
@@ -499,7 +600,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
case GT_LE:
case GT_GE:
case GT_GT:
- LowerCmp(tree);
+ TreeNodeInfoInitCmp(tree);
break;
case GT_CKFINITE:
@@ -542,10 +643,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
}
break;
-#ifdef _TARGET_X86_
- case GT_OBJ:
- NYI_X86("GT_OBJ");
-#elif !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#if !defined(FEATURE_PUT_STRUCT_ARG_STK)
case GT_OBJ:
#endif
case GT_BLK:
@@ -556,11 +654,11 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
info->dstCount = 0;
break;
-#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
case GT_PUTARG_STK:
- TreeNodeInfoInitPutArgStk(tree);
+ TreeNodeInfoInitPutArgStk(tree->AsPutArgStk());
break;
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+#endif // FEATURE_PUT_STRUCT_ARG_STK
case GT_STORE_BLK:
case GT_STORE_OBJ:
@@ -568,6 +666,12 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
TreeNodeInfoInitBlockStore(tree->AsBlk());
break;
+ case GT_INIT_VAL:
+ // Always a passthrough of its child's value.
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+
case GT_LCLHEAP:
TreeNodeInfoInitLclHeap(tree);
break;
@@ -634,14 +738,20 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
case GT_ARR_OFFSET:
// This consumes the offset, if any, the arrObj and the effective index,
// and produces the flattened offset for this dimension.
- info->srcCount = 3;
- info->dstCount = 1;
- info->internalIntCount = 1;
+ info->srcCount = 3;
+ info->dstCount = 1;
+
// we don't want to generate code for this
if (tree->gtArrOffs.gtOffset->IsIntegralConst(0))
{
MakeSrcContained(tree, tree->gtArrOffs.gtOffset);
}
+ else
+ {
+ // Here we simply need an internal register, which must be different
+ // from any of the operand's registers, but may be the same as targetReg.
+ info->internalIntCount = 1;
+ }
break;
case GT_LEA:
@@ -725,15 +835,9 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
#endif
case GT_CLS_VAR:
- info->srcCount = 0;
- // GT_CLS_VAR, by the time we reach the backend, must always
- // be a pure use.
- // It will produce a result of the type of the
- // node, and use an internal register for the address.
-
- info->dstCount = 1;
- assert((tree->gtFlags & (GTF_VAR_DEF | GTF_VAR_USEASG | GTF_VAR_USEDEF)) == 0);
- info->internalIntCount = 1;
+ // These nodes are eliminated by rationalizer.
+ JITDUMP("Unexpected node %s in Lower.\n", GenTree::NodeName(tree->OperGet()));
+ unreached();
break;
} // end switch (tree->OperGet())
@@ -813,27 +917,36 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
}
}
+ TreeNodeInfoInitCheckByteable(tree);
+
+ // We need to be sure that we've set info->srcCount and info->dstCount appropriately
+ assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCheckByteable: Check the tree to see if "byte-able" registers are
+// required, and set the tree node info accordingly.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitCheckByteable(GenTree* tree)
+{
#ifdef _TARGET_X86_
+ LinearScan* l = m_lsra;
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+
// Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands
// if the tree node is a byte type.
//
- // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'
- // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT
- // value. In this case we need to exclude esi/edi from the src candidates of op2.
- //
- // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool.
- //
- // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
- // ubyte as the result of comparison and if the result needs to be materialized into a reg
- // simply zero extend it to TYP_INT size. Here is an example of generated code:
- // cmp dl, byte ptr[addr mode]
- // movzx edx, dl
- //
// Though this looks conservative in theory, in practice we could not think of a case where
// the below logic leads to conservative register specification. In future when or if we find
// one such case, this logic needs to be fine tuned for that case(s).
- if (varTypeIsByte(tree) || ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType())) ||
- (tree->OperIsCompare() && varTypeIsByte(tree->gtGetOp1()) && varTypeIsByte(tree->gtGetOp2())))
+
+ if (ExcludeNonByteableRegisters(tree))
{
regMaskTP regMask;
if (info->dstCount > 0)
@@ -870,9 +983,6 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
}
}
#endif //_TARGET_X86_
-
- // We need to be sure that we've set info->srcCount and info->dstCount appropriately
- assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
}
//------------------------------------------------------------------------
@@ -1028,6 +1138,31 @@ void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree)
GenTreePtr shiftBy = tree->gtOp.gtOp2;
GenTreePtr source = tree->gtOp.gtOp1;
+#ifdef _TARGET_X86_
+ // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
+ // we can have a three operand form. Increment the srcCount.
+ if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
+ {
+ assert(source->OperGet() == GT_LONG);
+
+ info->srcCount++;
+
+ if (tree->OperGet() == GT_LSH_HI)
+ {
+ GenTreePtr sourceLo = source->gtOp.gtOp1;
+ sourceLo->gtLsraInfo.isDelayFree = true;
+ }
+ else
+ {
+ GenTreePtr sourceHi = source->gtOp.gtOp2;
+ sourceHi->gtLsraInfo.isDelayFree = true;
+ }
+
+ source->gtLsraInfo.hasDelayFreeSrc = true;
+ info->hasDelayFreeSrc = true;
+ }
+#endif
+
// x64 can encode 8 bits of shift and it will use 5 or 6. (the others are masked off)
// We will allow whatever can be encoded - hope you know what you are doing.
if (!IsContainableImmed(tree, shiftBy) || (shiftBy->gtIntConCommon.IconValue() > 255) ||
@@ -1040,6 +1175,17 @@ void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree)
else
{
MakeSrcContained(tree, shiftBy);
+
+ // Note that Rotate Left/Right instructions don't set ZF and SF flags.
+ //
+ // If the operand being shifted is 32-bits then upper three bits are masked
+ // by hardware to get actual shift count. Similarly for 64-bit operands
+ // shift count is narrowed to [0..63]. If the resulting shift count is zero,
+ // then shift operation won't modify flags.
+ //
+ // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
+ // if the shift count is known to be non-zero and in the range depending on the
+ // operand size.
}
}
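
Two short sketches for the shift handling above, both illustrative C++ assuming standard x86 shl/shld semantics (Shl32 and ShiftLeft64 are hypothetical names). The first shows the hardware masking of the shift count that the rotate/flags comment relies on; the second shows the 64-bit left-shift decomposition that motivates marking sourceLo delay-free for GT_LSH_HI:

    #include <cstdint>

    // 32-bit shifts use count & 31; 64-bit shifts use count & 63. A count
    // that masks to zero performs no shift and leaves the flags untouched.
    uint32_t Shl32(uint32_t value, unsigned count)
    {
        return value << (count & 31);
    }

    // GT_LSH_HI computes the high half of a long shift and still reads the
    // low half (shld-style), so the low source must stay live (delay-free).
    void ShiftLeft64(uint32_t& lo, uint32_t& hi, unsigned count /* 1..31 */)
    {
        hi = (hi << count) | (lo >> (32 - count)); // GT_LSH_HI
        lo <<= count;                              // low half
    }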
@@ -1088,6 +1234,12 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
assert(ctrlExpr == nullptr);
assert(call->gtCallAddr != nullptr);
ctrlExpr = call->gtCallAddr;
+
+#ifdef _TARGET_X86_
+ // Fast tail calls aren't currently supported on x86, but if they ever are, the code
+ // below that handles indirect VSD calls will need to be fixed.
+ assert(!call->IsFastTailCall() || !call->IsVirtualStub());
+#endif // _TARGET_X86_
}
// set reg requirements on call target represented as control sequence.
@@ -1103,7 +1255,24 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
// computed into a register.
if (!call->IsFastTailCall())
{
- if (ctrlExpr->isIndir())
+#ifdef _TARGET_X86_
+ // On x86, we need to generate a very specific pattern for indirect VSD calls:
+ //
+ // 3-byte nop
+ // call dword ptr [eax]
+ //
+ // Where EAX is also used as an argument to the stub dispatch helper. Make
+ // sure that the call target address is computed into EAX in this case.
+ if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
+ {
+ assert(ctrlExpr->isIndir());
+
+ ctrlExpr->gtGetOp1()->gtLsraInfo.setSrcCandidates(l, RBM_VIRTUAL_STUB_TARGET);
+ MakeSrcContained(call, ctrlExpr);
+ }
+ else
+#endif // _TARGET_X86_
+ if (ctrlExpr->isIndir())
{
MakeSrcContained(call, ctrlExpr);
}
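
For context, the call-site shape the x86 VSD constraint above arranges for. Only the "3-byte nop; call dword ptr [eax]" pattern comes from the comment in the hunk; the byte encodings here are my assumption:

    // Illustrative x86 indirect VSD call site:
    //   0F 1F 00   nop dword ptr [eax]    ; 3-byte nop
    //   FF 10      call dword ptr [eax]   ; EAX also feeds the stub helper
    //
    // Pinning the target address to EAX (RBM_VIRTUAL_STUB_TARGET) and
    // containing the indirection lets codegen emit exactly this pattern.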
@@ -1191,7 +1360,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
// First, count reg args
for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
{
- assert(list->IsList());
+ assert(list->OperIsList());
GenTreePtr argNode = list->Current();
@@ -1206,7 +1375,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
argNode->gtLsraInfo.srcCount = 1;
argNode->gtLsraInfo.dstCount = 0;
-#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
// If the node is TYP_STRUCT and it is put on stack with
// putarg_stk operation, we consume and produce no registers.
// In this case the embedded Obj node should not produce
@@ -1218,7 +1387,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0;
argNode->gtLsraInfo.srcCount = 0;
}
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+#endif // FEATURE_PUT_STRUCT_ARG_STK
continue;
}
@@ -1248,7 +1417,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
// If the struct arg is wrapped in CPYBLK the type of the param will be TYP_VOID.
// Use the curArgTabEntry's isStruct to get whether the param is a struct.
- if (varTypeIsStruct(argNode) FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(|| curArgTabEntry->isStruct))
+ if (varTypeIsStruct(argNode) PUT_STRUCT_ARG_STK_ONLY(|| curArgTabEntry->isStruct))
{
unsigned originalSize = 0;
LclVarDsc* varDsc = nullptr;
@@ -1270,16 +1439,16 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
{
originalSize = genTypeSize(argNode->gtType);
}
- else if (argNode->gtOper == GT_LIST)
+ else if (argNode->gtOper == GT_FIELD_LIST)
{
originalSize = 0;
// There could be up to 2 PUTARG_REGs in the list
- GenTreeArgList* argListPtr = argNode->AsArgList();
- unsigned iterationNum = 0;
- for (; argListPtr; argListPtr = argListPtr->Rest())
+ GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
+ unsigned iterationNum = 0;
+ for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest())
{
- GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1;
+ GenTreePtr putArgRegNode = fieldListPtr->Current();
assert(putArgRegNode->gtOper == GT_PUTARG_REG);
if (iterationNum == 0)
@@ -1509,7 +1678,7 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
}
m_lsra->clearOperandCounts(source);
}
- else if (!source->OperIsSIMD())
+ else if (!source->IsMultiRegCall() && !source->OperIsSIMD())
{
assert(source->IsLocal());
MakeSrcContained(blkNode, source);
@@ -1519,7 +1688,11 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
if (isInitBlk)
{
GenTree* initVal = source;
- srcAddrOrFill = source;
+ if (initVal->OperIsInitVal())
+ {
+ initVal = initVal->gtGetOp1();
+ }
+ srcAddrOrFill = initVal;
// If we have an InitBlk with constant block size we can optimize several ways:
// a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes
// we use rep stosb since this reduces the register pressure in LSRA and we have
@@ -1571,8 +1744,23 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
// a pack of 16 init value constants.
blkNode->gtLsraInfo.internalFloatCount = 1;
blkNode->gtLsraInfo.setInternalCandidates(l, l->internalFloatRegCandidates());
+ if ((fill == 0) && ((size & 0xf) == 0))
+ {
+ MakeSrcContained(blkNode, source);
+ }
}
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
+
+#ifdef _TARGET_X86_
+ if ((size & 1) != 0)
+ {
+ // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
+ // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
+ // when unrolling, so only allow byteable registers as the source value. (We could
+ // consider just using BlkOpKindRepInstr instead.)
+ sourceRegMask = RBM_BYTE_REGS;
+ }
+#endif // _TARGET_X86_
}
else
{
@@ -1825,7 +2013,7 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
}
}
-#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
//------------------------------------------------------------------------
// TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
//
@@ -1835,44 +2023,219 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
// Return Value:
// None.
//
-void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree)
+void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
{
- TreeNodeInfo* info = &(tree->gtLsraInfo);
+ TreeNodeInfo* info = &(putArgStk->gtLsraInfo);
LinearScan* l = m_lsra;
- if (tree->TypeGet() != TYP_STRUCT)
+#ifdef _TARGET_X86_
+ if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
+ {
+ putArgStk->gtNumberReferenceSlots = 0;
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Invalid;
+
+ GenTreeFieldList* fieldList = putArgStk->gtOp1->AsFieldList();
+
+ // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order
+ // of uses is visible to LSRA.
+ unsigned fieldCount = 0;
+ GenTreeFieldList* head = nullptr;
+ for (GenTreeFieldList *current = fieldList, *next; current != nullptr; current = next)
+ {
+ next = current->Rest();
+
+ // First, insert the field node into the sorted list.
+ GenTreeFieldList* prev = nullptr;
+ for (GenTreeFieldList* cursor = head;; cursor = cursor->Rest())
+ {
+ // If the offset of the current list node is greater than the offset of the cursor or if we have
+ // reached the end of the list, insert the current node before the cursor and terminate.
+ if ((cursor == nullptr) || (current->gtFieldOffset > cursor->gtFieldOffset))
+ {
+ if (prev == nullptr)
+ {
+ assert(cursor == head);
+ head = current;
+ }
+ else
+ {
+ prev->Rest() = current;
+ }
+
+ current->Rest() = cursor;
+ break;
+ }
+ }
+
+ fieldCount++;
+ }
+
+ info->srcCount = fieldCount;
+ info->dstCount = 0;
+
+ // In theory, the upper bound for the size of a field list is 8: these constructs only appear when passing the
+ // collection of lclVars that represent the fields of a promoted struct lclVar, and we do not promote struct
+ // lclVars with more than 4 fields. If each of these lclVars is of type long, decomposition will split the
+ // corresponding field list nodes in two, giving an upper bound of 8.
+ //
+ // The reason that this is important is that the algorithm we use above to sort the field list is O(N^2): if
+ // the maximum size of a field list grows significantly, we will need to reevaluate it.
+ assert(fieldCount <= 8);
+
+ // The sort above may have changed which node is at the head of the list. Update the PUTARG_STK node if
+ // necessary.
+ if (head != fieldList)
+ {
+ head->gtFlags |= GTF_FIELD_LIST_HEAD;
+ fieldList->gtFlags &= ~GTF_FIELD_LIST_HEAD;
+
+#ifdef DEBUG
+ head->gtSeqNum = fieldList->gtSeqNum;
+#endif // DEBUG
+
+ head->gtLsraInfo = fieldList->gtLsraInfo;
+ head->gtClearReg(comp);
+
+ BlockRange().InsertAfter(fieldList, head);
+ BlockRange().Remove(fieldList);
+
+ fieldList = head;
+ putArgStk->gtOp1 = fieldList;
+ }
+
+ // Now that the fields have been sorted, initialize the LSRA info.
+ bool allFieldsAreSlots = true;
+ bool needsByteTemp = false;
+ unsigned prevOffset = putArgStk->getArgSize();
+ for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
+ {
+ GenTree* const fieldNode = current->Current();
+ const var_types fieldType = fieldNode->TypeGet();
+ const unsigned fieldOffset = current->gtFieldOffset;
+ assert(fieldType != TYP_LONG);
+
+ // For x86 we must mark all integral fields as contained or reg-optional, and handle them
+ // accordingly in code generation, since we may have up to 8 fields, which cannot all be in
+ // registers to be consumed atomically by the call.
+ if (varTypeIsIntegralOrI(fieldNode))
+ {
+ if (fieldNode->OperGet() == GT_LCL_VAR)
+ {
+ LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]);
+ if (varDsc->lvTracked && !varDsc->lvDoNotEnregister)
+ {
+ SetRegOptional(fieldNode);
+ }
+ else
+ {
+ MakeSrcContained(putArgStk, fieldNode);
+ }
+ }
+ else if (fieldNode->IsIntCnsFitsInI32())
+ {
+ MakeSrcContained(putArgStk, fieldNode);
+ }
+ else
+ {
+ // For the case where we cannot directly push the value, if we run out of registers,
+ // it would be better to defer computation until we are pushing the arguments rather
+ // than spilling, but this situation is not all that common, as most cases of promoted
+ // structs do not have a large number of fields, and of those most are lclVars or
+ // copy-propagated constants.
+ SetRegOptional(fieldNode);
+ }
+ }
+ else
+ {
+ assert(varTypeIsFloating(fieldNode));
+ }
+
+ // We can treat as a slot any field that is stored at a slot boundary, where the previous
+ // field is not in the same slot. (Note that we store the fields in reverse order.)
+ const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
+ if (!fieldIsSlot)
+ {
+ allFieldsAreSlots = false;
+ if (varTypeIsByte(fieldType))
+ {
+ // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
+ // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
+ // need a byte-addressable register for the store. We will enforce this requirement on an internal
+ // register, which we can use to copy multiple byte values.
+ needsByteTemp = true;
+ }
+ }
+
+ if (varTypeIsGC(fieldType))
+ {
+ putArgStk->gtNumberReferenceSlots++;
+ }
+
+ prevOffset = fieldOffset;
+ }
+
+ // Set the copy kind.
+ // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should
+ // adjust the stack once for those fields. The latter is really best done in code generation, but
+ // this tuning should probably be undertaken as a whole.
+ // Also, if there are floating point fields, it may be better to use the "Unroll" mode
+ // of copying the struct as a whole, if the fields are not register candidates.
+ if (allFieldsAreSlots)
+ {
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots;
+ }
+ else
+ {
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
+ // If any of the fields cannot be stored with an actual push, we may need a temporary
+ // register to load the value before storing it to the stack location.
+ info->internalIntCount = 1;
+ regMaskTP regMask = l->allRegs(TYP_INT);
+ if (needsByteTemp)
+ {
+ regMask &= ~RBM_NON_BYTE_REGS;
+ }
+ info->setInternalCandidates(l, regMask);
+ }
+ return;
+ }
+#endif // _TARGET_X86_
+
+#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+ // For PutArgStk of a TYP_SIMD12, we need an extra register.
+ if (putArgStk->TypeGet() == TYP_SIMD12)
{
- TreeNodeInfoInitSimple(tree);
+ info->srcCount = putArgStk->gtOp1->gtLsraInfo.dstCount;
+ info->dstCount = 0;
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(l, l->allSIMDRegs());
return;
}
+#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
- GenTreePutArgStk* putArgStkTree = tree->AsPutArgStk();
+ if (putArgStk->TypeGet() != TYP_STRUCT)
+ {
+ TreeNodeInfoInitSimple(putArgStk);
+ return;
+ }
- GenTreePtr dst = tree;
- GenTreePtr src = tree->gtOp.gtOp1;
+ GenTreePtr dst = putArgStk;
+ GenTreePtr src = putArgStk->gtOp1;
GenTreePtr srcAddr = nullptr;
+ bool haveLocalAddr = false;
if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
{
srcAddr = src->gtOp.gtOp1;
+ assert(srcAddr != nullptr);
+ haveLocalAddr = srcAddr->OperIsLocalAddr();
}
else
{
- assert(varTypeIsSIMD(tree));
- }
- info->srcCount = src->gtLsraInfo.dstCount;
-
- // If this is a stack variable address,
- // make the op1 contained, so this way
- // there is no unnecessary copying between registers.
- // To avoid assertion, increment the parent's source.
- // It is recovered below.
- bool haveLocalAddr = ((srcAddr != nullptr) && (srcAddr->OperIsLocalAddr()));
- if (haveLocalAddr)
- {
- info->srcCount += 1;
+ assert(varTypeIsSIMD(putArgStk));
}
+ info->srcCount = src->gtLsraInfo.dstCount;
info->dstCount = 0;
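
The reordering loop in the hunk above is an insertion sort into descending-offset order. A standalone sketch over a simplified node type (hypothetical; the real code links GenTreeFieldList nodes through Rest()):

    struct FieldNode
    {
        unsigned   offset;
        FieldNode* rest;
    };

    // Insertion sort, descending by offset. O(N^2), which is acceptable
    // because the field count is bounded (asserted <= 8 above).
    FieldNode* SortByOffsetDescending(FieldNode* list)
    {
        FieldNode* head = nullptr;
        for (FieldNode *current = list, *next; current != nullptr; current = next)
        {
            next = current->rest;

            FieldNode* prev   = nullptr;
            FieldNode* cursor = head;
            while ((cursor != nullptr) && (current->offset <= cursor->offset))
            {
                prev   = cursor;
                cursor = cursor->rest;
            }

            current->rest = cursor;
            if (prev == nullptr)
            {
                head = current;
            }
            else
            {
                prev->rest = current;
            }
        }
        return head;
    }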
// In case of a CpBlk we could use a helper call. In case of putarg_stk we
@@ -1884,7 +2247,7 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree)
// This threshold will decide from using the helper or let the JIT decide to inline
// a code sequence of its choice.
ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
- ssize_t size = putArgStkTree->gtNumSlots * TARGET_POINTER_SIZE;
+ ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
// TODO-X86-CQ: The helper call either is not supported on x86 or required more work
// (I don't know which).
@@ -1892,7 +2255,7 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree)
// If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
// Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of
// our framework assemblies, so this is the main code generation scheme we'll use.
- if (size <= CPBLK_UNROLL_LIMIT && putArgStkTree->gtNumberReferenceSlots == 0)
+ if (size <= CPBLK_UNROLL_LIMIT && putArgStk->gtNumberReferenceSlots == 0)
{
// If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
//
@@ -1913,46 +2276,62 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree)
info->setInternalCandidates(l, regMask);
}
+#ifdef _TARGET_X86_
+ if (size >= 8)
+#else // !_TARGET_X86_
if (size >= XMM_REGSIZE_BYTES)
+#endif // !_TARGET_X86_
{
- // If we have a buffer larger than XMM_REGSIZE_BYTES,
- // reserve an XMM register to use it for a
+ // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
+ // or larger than or equal to 8 bytes on x86, reserve an XMM register to use for a
// series of 16-byte loads and stores.
info->internalFloatCount = 1;
info->addInternalCandidates(l, l->internalFloatRegCandidates());
}
- if (haveLocalAddr)
+#ifdef _TARGET_X86_
+ if (size < XMM_REGSIZE_BYTES)
{
- MakeSrcContained(putArgStkTree, srcAddr);
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
}
-
- // If src or dst are on stack, we don't have to generate the address into a register
- // because it's just some constant+SP
- putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindUnroll;
+ else
+#endif // _TARGET_X86_
+ {
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
+ }
+ }
+#ifdef _TARGET_X86_
+ else if (putArgStk->gtNumberReferenceSlots != 0)
+ {
+ // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
+ // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
}
+#endif // _TARGET_X86_
else
{
info->internalIntCount += 3;
info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI));
- if (haveLocalAddr)
- {
- MakeSrcContained(putArgStkTree, srcAddr);
- }
- putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindRepInstr;
+ putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
}
// Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
- MakeSrcContained(putArgStkTree, src);
+ MakeSrcContained(putArgStk, src);
- // Balance up the inc above.
if (haveLocalAddr)
{
- info->srcCount -= 1;
+ // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary
+ // copies.
+ //
+ // To avoid an assertion in MakeSrcContained, increment the parent's source count beforehand and decrement it
+ // afterwards.
+ info->srcCount++;
+ MakeSrcContained(putArgStk, srcAddr);
+ info->srcCount--;
}
}
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+#endif // FEATURE_PUT_STRUCT_ARG_STK
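
A sketch of the copy strategy the internal registers above serve: 16-byte XMM chunks for the bulk of the buffer, with the sub-XMM remainder going through an integer temp (intrinsics and the CopyBlockUnrolled name are for illustration only; the JIT emits the loads and stores directly):

    #include <cstddef>
    #include <cstring>
    #include <emmintrin.h>

    void CopyBlockUnrolled(void* dst, const void* src, size_t size)
    {
        size_t i = 0;
        for (; i + 16 <= size; i += 16) // series of 16-byte loads/stores
        {
            __m128i chunk = _mm_loadu_si128((const __m128i*)((const char*)src + i));
            _mm_storeu_si128((__m128i*)((char*)dst + i), chunk);
        }
        // A remainder smaller than XMM_REGSIZE_BYTES is copied through an
        // integer temp register (memcpy stands in for that here).
        memcpy((char*)dst + i, (const char*)src + i, size - i);
    }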
//------------------------------------------------------------------------
// TreeNodeInfoInitLclHeap: Set the NodeInfo for a GT_LCLHEAP.
@@ -1976,13 +2355,17 @@ void Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
// Here '-' means don't care.
//
// Size? Init Memory? # temp regs
- // 0 - 0
- // const and <=6 reg words - 0
- // const and >6 reg words Yes 0
+ // 0 - 0 (returns 0)
+ // const and <=6 reg words - 0 (pushes '0')
+ // const and >6 reg words Yes 0 (pushes '0')
// const and <PageSize No 0 (amd64) 1 (x86)
- // const and >=PageSize No 2
- // Non-const Yes 0
- // Non-const No 2
+ // (x86: tmpReg for subtracting from esp)
+ // const and >=PageSize No 2 (regCnt and tmpReg for subtracting from sp)
+ // Non-const Yes 0 (regCnt=targetReg and pushes '0')
+ // Non-const No 2 (regCnt and tmpReg for subtracting from sp)
+ //
+ // Note: Here we don't need the internal register to be different from targetReg;
+ // rather, we require it to be different from the operand's reg.
GenTreePtr size = tree->gtOp.gtOp1;
if (size->IsCnsIntOrI())
@@ -2121,6 +2504,9 @@ void Lowering::TreeNodeInfoInitLogicalOp(GenTree* tree)
// as reg optional.
SetRegOptionalForBinOp(tree);
}
+
+ // Codegen of this tree node sets ZF and SF flags.
+ tree->gtFlags |= GTF_ZSF_SET;
}
//------------------------------------------------------------------------
@@ -2189,15 +2575,40 @@ void Lowering::TreeNodeInfoInitModDiv(GenTree* tree)
info->setDstCandidates(l, RBM_RAX);
}
- // If possible would like to have op1 in RAX to avoid a register move
- op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
+ bool op2CanBeRegOptional = true;
+#ifdef _TARGET_X86_
+ if (op1->OperGet() == GT_LONG)
+ {
+ // To avoid a register move, we would like op1's low part in RAX and its high part in RDX.
+ GenTree* loVal = op1->gtGetOp1();
+ GenTree* hiVal = op1->gtGetOp2();
+
+ // Src count is actually 3, so increment.
+ assert(op2->IsCnsIntOrI());
+ assert(tree->OperGet() == GT_UMOD);
+ info->srcCount++;
+ op2CanBeRegOptional = false;
+
+ // This situation also requires an internal register.
+ info->internalIntCount = 1;
+ info->setInternalCandidates(l, l->allRegs(TYP_INT));
+
+ loVal->gtLsraInfo.setSrcCandidates(l, RBM_EAX);
+ hiVal->gtLsraInfo.setSrcCandidates(l, RBM_EDX);
+ }
+ else
+#endif
+ {
+ // If possible would like to have op1 in RAX to avoid a register move
+ op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
+ }
// divisor can be an r/m, but the memory indirection must be of the same size as the divide
if (op2->isMemoryOp() && (op2->TypeGet() == tree->TypeGet()))
{
MakeSrcContained(tree, op2);
}
- else
+ else if (op2CanBeRegOptional)
{
op2->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
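
A sketch of the register constraint above: x86 `div r/m32` divides EDX:EAX, so for a GT_UMOD with a GT_LONG dividend the low half is pinned to EAX and the high half to EDX (illustrative C++ under that assumption, not the emitted code; UModLong is a made-up name):

    #include <cstdint>

    // loVal ends up in EAX, hiVal in EDX; div leaves the remainder in EDX.
    // The JIT only takes this path for a constant divisor, where the
    // quotient is known to fit in 32 bits.
    uint32_t UModLong(uint32_t loVal, uint32_t hiVal, uint32_t divisor)
    {
        uint64_t dividend = ((uint64_t)hiVal << 32) | loVal;
        return (uint32_t)(dividend % divisor);
    }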
@@ -2298,12 +2709,13 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
info->dstCount = 1;
switch (simdTree->gtSIMDIntrinsicID)
{
+ GenTree* op1;
GenTree* op2;
case SIMDIntrinsicInit:
{
info->srcCount = 1;
- GenTree* op1 = tree->gtOp.gtOp1;
+ op1 = tree->gtOp.gtOp1;
// This sets all fields of a SIMD struct to the given value.
// Mark op1 as contained if it is either zero or int constant of all 1's,
@@ -2377,7 +2789,8 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
info->srcCount = 2;
// SSE2 32-bit integer multiplication requires two temp regs
- if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT)
+ if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
+ comp->getSIMDInstructionSet() == InstructionSet_SSE2)
{
info->internalFloatCount = 2;
info->setInternalCandidates(lsra, lsra->allSIMDRegs());
@@ -2406,38 +2819,78 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
case SIMDIntrinsicOpEquality:
case SIMDIntrinsicOpInEquality:
- // Need two SIMD registers as scratch.
- // See genSIMDIntrinsicRelOp() for details on code sequence generate and
- // the need for two scratch registers.
- info->srcCount = 2;
- info->internalFloatCount = 2;
- info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ info->srcCount = 2;
+
+ // On SSE4/AVX, we can generate optimal code for (in)equality
+ // against zero using ptest. We can safely do this optimization
+ // for integral vectors but not for floating-point vectors, because
+ // floating point has both +0.0 and -0.0, and +0.0 == -0.0.
+ op2 = tree->gtGetOp2();
+ if ((comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0))
+ {
+ MakeSrcContained(tree, op2);
+ }
+ else
+ {
+
+ // Need one SIMD register as scratch.
+ // See genSIMDIntrinsicRelOp() for details on code sequence generated and
+ // the need for one scratch register.
+ //
+ // Note that these intrinsics produce a BOOL result, so the reserved
+ // internal float registers are guaranteed to differ from the integer
+ // target register without our specifying it explicitly.
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
break;
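
The ptest optimization above, sketched with intrinsics (assumes SSE4.1 is available; illustrative only, AllZero is a hypothetical helper): comparing an integral vector against zero sets ZF directly, so no integer scratch register is needed to materialize a mask.

    #include <smmintrin.h>

    bool AllZero(__m128i v)
    {
        // ptest sets ZF = ((v & v) == 0); codegen can branch on the flag.
        return _mm_testz_si128(v, v) != 0;
    }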
case SIMDIntrinsicDotProduct:
- if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) ||
- (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
+ // Float/Double vectors:
+ // For SSE, or AVX with 32-byte vectors, we also need an internal register
+ // as scratch. Further we need the targetReg and internal reg to be distinct
+ // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
+ // don't need a tmpReg.
+ //
+ // 32-byte integer vector on SSE4/AVX:
+ // will take advantage of phaddd, which operates only on 128-bit xmm reg.
+ // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
+ // registers since targetReg is an int type register.
+ //
+ // See genSIMDIntrinsicDotProduct() for details on code sequence generated
+ // and the need for scratch registers.
+ if (varTypeIsFloating(simdTree->gtSIMDBaseType))
{
- // For SSE, or AVX with 32-byte vectors, we also need an internal register as scratch.
- // Further we need the targetReg and internal reg to be distinct registers.
- // This is achieved by requesting two internal registers; thus one of them
- // will be different from targetReg.
- // Note that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
- //
- // See genSIMDIntrinsicDotProduct() for details on code sequence generated and
- // the need for scratch registers.
- info->internalFloatCount = 2;
+ if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) ||
+ (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
+ {
+ info->internalFloatCount = 1;
+ info->isInternalRegDelayFree = true;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
+ // else don't need scratch reg(s).
+ }
+ else
+ {
+ assert(simdTree->gtSIMDBaseType == TYP_INT && comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4);
+
+ // No need to set isInternalRegDelayFree since targetReg is an
+ // int type reg and is guaranteed to be different from the xmm/ymm
+ // regs.
+ info->internalFloatCount = comp->canUseAVX() ? 2 : 1;
info->setInternalCandidates(lsra, lsra->allSIMDRegs());
}
info->srcCount = 2;
break;
case SIMDIntrinsicGetItem:
+ {
// This implements get_Item method. The sources are:
// - the source SIMD struct
// - index (which element to get)
// The result is baseType of SIMD struct.
info->srcCount = 2;
+ op1 = tree->gtOp.gtOp1;
op2 = tree->gtOp.gtOp2;
// If the index is a constant, mark it as contained.
@@ -2446,48 +2899,69 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
info->srcCount = 1;
}
- // If the index is not a constant, we will use the SIMD temp location to store the vector.
- // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
- // can use that in the process of extracting the element.
- //
- // If the index is a constant and base type is a small int we can use pextrw, but on AVX
- // we will need a temp if are indexing into the upper half of the AVX register.
- // In all other cases with constant index, we need a temp xmm register to extract the
- // element if index is other than zero.
-
- if (!op2->IsCnsIntOrI())
+ if (op1->isMemoryOp())
{
- (void)comp->getSIMDInitTempVarNum();
+ MakeSrcContained(tree, op1);
+
+ // Although GT_IND of TYP_SIMD12 reserves an internal float
+ // register for reading 4 and 8 bytes from memory and
+ // assembling them into target XMM reg, it is not required
+ // in this case.
+ op1->gtLsraInfo.internalIntCount = 0;
+ op1->gtLsraInfo.internalFloatCount = 0;
}
- else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
+ else
{
- bool needFloatTemp;
- if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
- (comp->getSIMDInstructionSet() == InstructionSet_AVX))
- {
- int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
- needFloatTemp = (byteShiftCnt >= 16);
- }
- else
+ // If the index is not a constant, we will use the SIMD temp location to store the vector.
+ // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
+ // can use that in the process of extracting the element.
+ //
+ // If the index is a constant and base type is a small int we can use pextrw, but on AVX
+ // we will need a temp if we are indexing into the upper half of the AVX register.
+ // In all other cases with constant index, we need a temp xmm register to extract the
+ // element if index is other than zero.
+
+ if (!op2->IsCnsIntOrI())
{
- needFloatTemp = !op2->IsIntegralConst(0);
+ (void)comp->getSIMDInitTempVarNum();
}
- if (needFloatTemp)
+ else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
{
- info->internalFloatCount = 1;
- info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ bool needFloatTemp;
+ if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
+ (comp->getSIMDInstructionSet() == InstructionSet_AVX))
+ {
+ int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
+ needFloatTemp = (byteShiftCnt >= 16);
+ }
+ else
+ {
+ needFloatTemp = !op2->IsIntegralConst(0);
+ }
+
+ if (needFloatTemp)
+ {
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
}
}
- break;
+ }
+ break;
case SIMDIntrinsicSetX:
case SIMDIntrinsicSetY:
case SIMDIntrinsicSetZ:
case SIMDIntrinsicSetW:
- // We need an internal integer register
- info->srcCount = 2;
- info->internalIntCount = 1;
- info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT));
+ info->srcCount = 2;
+
+ // We need an internal integer register for SSE2 codegen
+ if (comp->getSIMDInstructionSet() == InstructionSet_SSE2)
+ {
+ info->internalIntCount = 1;
+ info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT));
+ }
+
break;
case SIMDIntrinsicCast:
@@ -2592,6 +3066,8 @@ void Lowering::TreeNodeInfoInitCast(GenTree* tree)
{
if (genTypeSize(castOpType) == 8)
{
+ // Here we don't need the internal register to be different from targetReg;
+ // rather, we require it to be different from the operand's reg.
info->internalIntCount = 1;
}
}
@@ -2693,7 +3169,6 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
GenTreePtr index = nullptr;
unsigned mul, cns;
bool rev;
- bool modifiedSources = false;
#ifdef FEATURE_SIMD
// If indirTree is of TYP_SIMD12, don't mark addr as contained
@@ -2711,11 +3186,10 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
info->internalFloatCount = 1;
// In case of GT_IND we need an internal register different from targetReg and
- // both of the registers are used at the same time. This achieved by reserving
- // two internal registers
+ // both of the registers are used at the same time.
if (indirTree->OperGet() == GT_IND)
{
- (info->internalFloatCount)++;
+ info->isInternalRegDelayFree = true;
}
info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
@@ -2724,16 +3198,21 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
}
#endif // FEATURE_SIMD
- // These nodes go into an addr mode:
- // - GT_CLS_VAR_ADDR turns into a constant.
- // - GT_LCL_VAR_ADDR is a stack addr mode.
- if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
+ if ((indirTree->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0)
{
+ // The address of an indirection that requires its address in a reg.
+ // Skip any further processing that might otherwise make it contained.
+ }
+ else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
+ {
+ // These nodes go into an addr mode:
+ // - GT_CLS_VAR_ADDR turns into a constant.
+ // - GT_LCL_VAR_ADDR is a stack addr mode.
+
// make this contained, it turns into a constant that goes into an addr mode
MakeSrcContained(indirTree, addr);
}
- else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp) &&
- addr->gtLsraInfo.getDstCandidates(m_lsra) != RBM_VIRTUAL_STUB_PARAM)
+ else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp))
{
// Amd64:
// We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address.
@@ -2755,17 +3234,10 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
}
else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr))
{
- GenTreeAddrMode* lea = addr->AsAddrMode();
- base = lea->Base();
- index = lea->Index();
-
- m_lsra->clearOperandCounts(addr);
- // The srcCount is decremented because addr is now "contained",
- // then we account for the base and index below, if they are non-null.
- info->srcCount--;
+ MakeSrcContained(indirTree, addr);
}
else if (comp->codeGen->genCreateAddrMode(addr, -1, true, 0, &rev, &base, &index, &mul, &cns, true /*nogen*/) &&
- !(modifiedSources = AreSourcesPossiblyModifiedLocals(indirTree, base, index)))
+ !AreSourcesPossiblyModifiedLocals(indirTree, base, index))
{
// An addressing mode will be constructed that may cause some
// nodes to not need a register, and cause others' lifetimes to be extended
@@ -2774,7 +3246,16 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
assert(base != addr);
m_lsra->clearOperandCounts(addr);
- GenTreePtr arrLength = nullptr;
+ const bool hasBase = base != nullptr;
+ const bool hasIndex = index != nullptr;
+ assert(hasBase || hasIndex); // At least one of a base or an index must be present.
+
+ // If the addressing mode has both a base and an index, bump its source count by one. If it only has one or the
+ // other, its source count is already correct (due to the source for the address itself).
+ if (hasBase && hasIndex)
+ {
+ info->srcCount++;
+ }
// Traverse the computation below GT_IND to find the operands
// for the addressing mode, marking the various constants and
@@ -2784,14 +3265,13 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
// up of simple arithmetic operators, and the code generator
// only traverses one leg of each node.
- bool foundBase = (base == nullptr);
- bool foundIndex = (index == nullptr);
- GenTreePtr nextChild = nullptr;
- for (GenTreePtr child = addr; child != nullptr && !child->OperIsLeaf(); child = nextChild)
+ bool foundBase = !hasBase;
+ bool foundIndex = !hasIndex;
+ for (GenTree *child = addr, *nextChild = nullptr; child != nullptr && !child->OperIsLeaf(); child = nextChild)
{
- nextChild = nullptr;
- GenTreePtr op1 = child->gtOp.gtOp1;
- GenTreePtr op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr;
+ nextChild = nullptr;
+ GenTree* op1 = child->gtOp.gtOp1;
+ GenTree* op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr;
if (op1 == base)
{
@@ -2832,7 +3312,6 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
}
}
assert(foundBase && foundIndex);
- info->srcCount--; // it gets incremented below.
}
else if (addr->gtOper == GT_ARR_ELEM)
{
@@ -2845,32 +3324,23 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
assert(addr->gtLsraInfo.srcCount >= 2);
addr->gtLsraInfo.srcCount -= 1;
}
- else
- {
- // it is nothing but a plain indir
- info->srcCount--; // base gets added in below
- base = addr;
- }
-
- if (base != nullptr)
- {
- info->srcCount++;
- }
-
- if (index != nullptr && !modifiedSources)
- {
- info->srcCount++;
- }
}
-void Lowering::LowerCmp(GenTreePtr tree)
+void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree)
{
+ assert(tree->OperIsCompare());
+
TreeNodeInfo* info = &(tree->gtLsraInfo);
info->srcCount = 2;
info->dstCount = 1;
#ifdef _TARGET_X86_
+ // If the compare is used by a jump, we just need to set the condition codes. If not, then we need
+ // to store the result into the low byte of a register, which requires the dst be a byteable register.
+ // We always set the dst candidates, though, because if this compare is consumed by a jump, they
+ // won't be used. We might be able to use GTF_RELOP_JMP_USED to determine this case, but it's not clear
+ // that flag is maintained until this location (especially for decomposed long compares).
info->setDstCandidates(m_lsra, RBM_BYTE_REGS);
#endif // _TARGET_X86_
@@ -2894,9 +3364,9 @@ void Lowering::LowerCmp(GenTreePtr tree)
#endif // !defined(_TARGET_64BIT_)
// If either of op1 or op2 is floating point values, then we need to use
- // ucomiss or ucomisd to compare, both of which support the following form
- // ucomis[s|d] xmm, xmm/mem. That is only the second operand can be a memory
- // op.
+ // ucomiss or ucomisd to compare, both of which support the following form:
+ // ucomis[s|d] xmm, xmm/mem
+ // That is, only the second operand can be a memory op.
//
// Second operand is a memory Op: Note that depending on comparison operator,
// the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or
@@ -2952,16 +3422,9 @@ void Lowering::LowerCmp(GenTreePtr tree)
bool hasShortCast = false;
if (CheckImmedAndMakeContained(tree, op2))
{
- bool op1CanBeContained = (op1Type == op2Type);
- if (!op1CanBeContained)
- {
- if (genTypeSize(op1Type) == genTypeSize(op2Type))
- {
- // The constant is of the correct size, but we don't have an exact type match
- // We can treat the isMemoryOp as "contained"
- op1CanBeContained = true;
- }
- }
+ // If the types are the same, or if the constant is of the correct size,
+ // we can treat the isMemoryOp as contained.
+ bool op1CanBeContained = (genTypeSize(op1Type) == genTypeSize(op2Type));
// Do we have a short compare against a constant in op2
//
@@ -3031,13 +3494,13 @@ void Lowering::LowerCmp(GenTreePtr tree)
bool op1IsMadeContained = false;
// When op1 is a GT_AND we can often generate a single "test" instruction
- // instead of two instructions (an "and" instruction followed by a "cmp"/"test")
+ // instead of two instructions (an "and" instruction followed by a "cmp"/"test").
//
- // This instruction can only be used for equality or inequality comparions.
+ // This instruction can only be used for equality or inequality comparisons.
// and we must have a compare against zero.
//
// If we have a positive test for a single bit we can reverse the condition and
- // make the compare be against zero
+ // make the compare be against zero.
//
// Example:
// GT_EQ GT_NE
@@ -3046,8 +3509,8 @@ void Lowering::LowerCmp(GenTreePtr tree)
// / \ / \
// andOp1 GT_CNS (0x100) andOp1 GT_CNS (0x100)
//
- // We will mark the GT_AND node as contained if the tree is a equality compare with zero
- // Additionally when we do this we also allow for a contained memory operand for "andOp1".
+ // We will mark the GT_AND node as contained if the tree is an equality compare with zero.
+ // Additionally, when we do this we also allow for a contained memory operand for "andOp1".
//
bool isEqualityCompare = (tree->gtOper == GT_EQ || tree->gtOper == GT_NE);
@@ -3066,7 +3529,7 @@ void Lowering::LowerCmp(GenTreePtr tree)
// so that we can generate a test instruction.
// Reverse the equality comparison
- tree->gtOper = (tree->gtOper == GT_EQ) ? GT_NE : GT_EQ;
+ tree->SetOperRaw((tree->gtOper == GT_EQ) ? GT_NE : GT_EQ);
// Change the relOp2CnsVal to zero
relOp2CnsVal = 0;
@@ -3171,7 +3634,7 @@ void Lowering::LowerCmp(GenTreePtr tree)
genTreeOps castOp1Oper = castOp1->OperGet();
bool safeOper = false;
- // It is not always safe to change the gtType of 'castOp1' to TYP_UBYTE
+ // It is not always safe to change the gtType of 'castOp1' to TYP_UBYTE.
// For example when 'castOp1Oper' is a GT_RSZ or GT_RSH then we are shifting
// bits from the left into the lower bits. If we change the type to a TYP_UBYTE
// we will instead generate a byte sized shift operation: shr al, 24
@@ -3196,22 +3659,24 @@ void Lowering::LowerCmp(GenTreePtr tree)
//
assert(!castOp1->gtOverflowEx()); // Must not be an overflow checking operation
- GenTreePtr removeTreeNode = op1;
- tree->gtOp.gtOp1 = castOp1;
- op1 = castOp1;
- castOp1->gtType = TYP_UBYTE;
-
- // trim down the value if castOp1 is an int constant since its type changed to UBYTE.
- if (castOp1Oper == GT_CNS_INT)
- {
- castOp1->gtIntCon.gtIconVal = (UINT8)castOp1->gtIntCon.gtIconVal;
- }
-
+ // TODO-Cleanup: we're within "if (CheckImmedAndMakeContained(tree, op2))", so isn't
+ // the following condition always true?
if (op2->isContainedIntOrIImmed())
{
ssize_t val = (ssize_t)op2->AsIntConCommon()->IconValue();
if (val >= 0 && val <= 255)
{
+ GenTreePtr removeTreeNode = op1;
+ tree->gtOp.gtOp1 = castOp1;
+ op1 = castOp1;
+ castOp1->gtType = TYP_UBYTE;
+
+ // trim down the value if castOp1 is an int constant since its type changed to UBYTE.
+ if (castOp1Oper == GT_CNS_INT)
+ {
+ castOp1->gtIntCon.gtIconVal = (UINT8)castOp1->gtIntCon.gtIconVal;
+ }
+
op2->gtType = TYP_UBYTE;
tree->gtFlags |= GTF_UNSIGNED;
@@ -3222,18 +3687,26 @@ void Lowering::LowerCmp(GenTreePtr tree)
MakeSrcContained(tree, op1);
op1IsMadeContained = true;
}
- }
- }
- BlockRange().Remove(removeTreeNode);
+ BlockRange().Remove(removeTreeNode);
+
+ // We've changed the type on op1 to TYP_UBYTE, but we already processed that node.
+ // We need to go back and mark it byteable.
+ // TODO-Cleanup: it might be better to move this out of the TreeNodeInfoInit pass to
+ // the earlier "lower" pass, in which case the byteable check would just fall out.
+ // But that is quite complex!
+ TreeNodeInfoInitCheckByteable(op1);
+
#ifdef DEBUG
- if (comp->verbose)
- {
- printf("LowerCmp: Removing a GT_CAST to TYP_UBYTE and changing castOp1->gtType to "
- "TYP_UBYTE\n");
- comp->gtDispTreeRange(BlockRange(), tree);
- }
+ if (comp->verbose)
+ {
+ printf("TreeNodeInfoInitCmp: Removing a GT_CAST to TYP_UBYTE and changing "
+ "castOp1->gtType to TYP_UBYTE\n");
+ comp->gtDispTreeRange(BlockRange(), tree);
+ }
#endif
+ }
+ }
}
}
@@ -3241,6 +3714,41 @@ void Lowering::LowerCmp(GenTreePtr tree)
if (!op1IsMadeContained)
{
SetRegOptional(op1);
+
+ // If op1's codegen sets the ZF and SF flags and the compare is ==/!=
+ // against zero, we don't need to generate a test instruction,
+ // provided we don't have another GenTree node between op1
+ // and tree that could potentially modify the flags.
+ //
+ // TODO-CQ: right now the below peep is inexpensive and
+ // gets the benefit in most cases, because in the majority
+ // of cases op1, op2 and tree would be in that order in
+ // execution. In general we should be able to check that all
+ // the nodes that come after op1 in execution order do not
+ // modify the flags so that it is safe to avoid generating a
+ // test instruction. Such a check requires that on each
+ // GenTree node we need to set the info whether its codegen
+ // will modify flags.
+ //
+ // TODO-CQ: We can optimize compare against zero in the
+ // following cases by generating the branch as indicated
+ // against each case.
+ // 1) unsigned compare
+ // < 0 - always FALSE
+ // <= 0 - ZF=1 and je
+ // > 0 - ZF=0 and jne
+ // >= 0 - always TRUE
+ //
+ // 2) signed compare
+ // < 0 - SF=1 and js
+ // >= 0 - SF=0 and jns
+ if (isEqualityCompare && op1->gtSetZSFlags() && op2->IsIntegralConst(0) && (op1->gtNext == op2) &&
+ (op2->gtNext == tree))
+ {
+ // Require codegen of op1 to set the flags.
+ assert(!op1->gtSetFlags());
+ op1->gtFlags |= GTF_SET_FLAGS;
+ }
}
}
}
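
The peephole above targets patterns like the following, where the AND already sets ZF and the compare against zero becomes free (a minimal sketch; AnyBitSet is a hypothetical example function):

    #include <cstdint>

    bool AnyBitSet(uint32_t x, uint32_t mask)
    {
        // "and reg, mask" sets ZF. With GTF_SET_FLAGS on the AND node, the
        // JIT can emit "jne" directly instead of "test reg, reg; jne".
        return (x & mask) != 0;
    }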
@@ -3255,10 +3763,17 @@ void Lowering::LowerCmp(GenTreePtr tree)
{
MakeSrcContained(tree, op1);
}
+ else if (op1->IsCnsIntOrI())
+ {
+ // TODO-CQ: We should be able to support swapping op1 and op2 to generate cmp reg, imm,
+ // but there is currently an assert in CodeGen::genCompareInt().
+ // https://github.com/dotnet/coreclr/issues/7270
+ SetRegOptional(op2);
+ }
else
{
// One of op1 or op2 could be marked as reg optional
- // to indicate that codgen can still generate code
+ // to indicate that codegen can still generate code
// if one of them is on stack.
SetRegOptional(PreferredRegOptionalOperand(tree));
}
@@ -3318,7 +3833,6 @@ void Lowering::LowerCast(GenTree* tree)
var_types dstType = tree->CastToType();
var_types srcType = op1->TypeGet();
var_types tmpType = TYP_UNDEF;
- bool srcUns = false;
// force the srcType to unsigned if GT_UNSIGNED flag is set
if (tree->gtFlags & GTF_UNSIGNED)
@@ -3849,6 +4363,20 @@ bool Lowering::SetStoreIndOpCountsIfRMWMemOp(GenTreePtr storeInd)
}
m_lsra->clearOperandCounts(indirCandidateChild);
+#ifdef _TARGET_X86_
+ if (varTypeIsByte(storeInd))
+ {
+ // If storeInd is of TYP_BYTE, restrict indirOpSource's candidates to byteable registers.
+ bool containedNode = indirOpSource->gtLsraInfo.dstCount == 0;
+ if (!containedNode)
+ {
+ regMaskTP regMask = indirOpSource->gtLsraInfo.getSrcCandidates(m_lsra);
+ assert(regMask != RBM_NONE);
+ indirOpSource->gtLsraInfo.setSrcCandidates(m_lsra, regMask & ~RBM_NON_BYTE_REGS);
+ }
+ }
+#endif
+
return true;
}
@@ -3858,8 +4386,11 @@ bool Lowering::SetStoreIndOpCountsIfRMWMemOp(GenTreePtr storeInd)
*/
void Lowering::SetMulOpCounts(GenTreePtr tree)
{
+#if defined(_TARGET_X86_)
+ assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI || tree->OperGet() == GT_MUL_LONG);
+#else
assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI);
-
+#endif
TreeNodeInfo* info = &(tree->gtLsraInfo);
info->srcCount = 2;
@@ -3900,13 +4431,18 @@ void Lowering::SetMulOpCounts(GenTreePtr tree)
GenTreeIntConCommon* imm = nullptr;
GenTreePtr other = nullptr;
- // There are three forms of x86 multiply:
- // one-op form: RDX:RAX = RAX * r/m
- // two-op form: reg *= r/m
- // three-op form: reg = r/m * imm
+// There are three forms of x86 multiply:
+// one-op form: RDX:RAX = RAX * r/m
+// two-op form: reg *= r/m
+// three-op form: reg = r/m * imm
- // This special widening 32x32->64 MUL is not used on x64
- assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
+// This special widening 32x32->64 MUL is not used on x64
+#if defined(_TARGET_X86_)
+ if (tree->OperGet() != GT_MUL_LONG)
+#endif
+ {
+ assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
+ }
// Multiply should never be using small types
assert(!varTypeIsSmall(tree->TypeGet()));
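
A sketch of the widening one-op form that GT_MUL_LONG maps to; the constraint above fixes the result in EDX:EAX (illustrative C++, MulLong is a made-up name):

    #include <cstdint>

    // 32x32->64 multiply: "mul r/m32" with one operand in EAX leaves the
    // 64-bit product in EDX:EAX, which is why the dst candidates are fixed.
    uint64_t MulLong(uint32_t a, uint32_t b)
    {
        return (uint64_t)a * (uint64_t)b;
    }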
@@ -3924,12 +4460,21 @@ void Lowering::SetMulOpCounts(GenTreePtr tree)
info->setDstCandidates(m_lsra, RBM_RAX);
hasImpliedFirstOperand = true;
}
- else if (tree->gtOper == GT_MULHI)
+ else if (tree->OperGet() == GT_MULHI)
+ {
+ // Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the
+ // upper 32 bits of the result set the destination candidate to REG_RDX.
+ info->setDstCandidates(m_lsra, RBM_RDX);
+ hasImpliedFirstOperand = true;
+ }
+#if defined(_TARGET_X86_)
+ else if (tree->OperGet() == GT_MUL_LONG)
{
// have to use the encoding:RDX:RAX = RAX * rm
info->setDstCandidates(m_lsra, RBM_RAX);
hasImpliedFirstOperand = true;
}
+#endif
else if (IsContainableImmed(tree, op2) || IsContainableImmed(tree, op1))
{
if (IsContainableImmed(tree, op2))
@@ -4187,6 +4732,71 @@ GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree)
return preferredOp;
}
+#ifdef _TARGET_X86_
+//------------------------------------------------------------------------
+// ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for
+// various reasons.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// True if we need to exclude non-byteable registers; false otherwise.
+//
+bool Lowering::ExcludeNonByteableRegisters(GenTree* tree)
+{
+ // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'
+ // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT
+ // value. In this case we need to exclude esi/edi from the src candidates of op2.
+ if (varTypeIsByte(tree))
+ {
+ return true;
+ }
+ // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool.
+ else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType()))
+ {
+ return true;
+ }
+ else if (tree->OperIsCompare())
+ {
+ GenTree* op1 = tree->gtGetOp1();
+ GenTree* op2 = tree->gtGetOp2();
+
+ // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
+ // ubyte as the result of comparison and if the result needs to be materialized into a reg
+ // simply zero extend it to TYP_INT size. Here is an example of generated code:
+ // cmp dl, byte ptr[addr mode]
+ // movzx edx, dl
+ if (varTypeIsByte(op1) && varTypeIsByte(op2))
+ {
+ return true;
+ }
+ // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses
+ // ubyte as the result of the comparison and if the result needs to be materialized into a reg
+ // simply zero extend it to TYP_INT size.
+ else if (varTypeIsByte(op1) && op2->IsCnsIntOrI())
+ {
+ return true;
+ }
+ // Example5: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses
+ // ubyte as the result of the comparison and if the result needs to be materialized into a reg
+ // simply zero extend it to TYP_INT size.
+ else if (op1->IsCnsIntOrI() && varTypeIsByte(op2))
+ {
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ else
+ {
+ return false;
+ }
+}
+#endif // _TARGET_X86_
+
#endif // _TARGET_XARCH_
#endif // !LEGACY_BACKEND