From db20f3f1bb8595633a7e16c8900fd401a453a6b5 Mon Sep 17 00:00:00 2001 From: Jiyoung Yun Date: Tue, 27 Dec 2016 16:46:08 +0900 Subject: Imported Upstream version 1.0.0.9127 --- src/jit/lowerxarch.cpp | 1104 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 857 insertions(+), 247 deletions(-) (limited to 'src/jit/lowerxarch.cpp') diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index 6f98eb6661..589cef482e 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -77,7 +77,7 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc) // InitBlk MakeSrcContained(storeLoc, op1); } - else if (storeLoc->TypeGet() == TYP_SIMD12) + else if ((storeLoc->TypeGet() == TYP_SIMD12) && (storeLoc->OperGet() == GT_STORE_LCL_FLD)) { // Need an additional register to extract upper 4 bytes of Vector3. info->internalFloatCount = 1; @@ -177,6 +177,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) break; case GT_LCL_FLD: + case GT_LCL_VAR: info->srcCount = 0; info->dstCount = 1; @@ -185,9 +186,9 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) if (tree->TypeGet() == TYP_SIMD12) { // We need an internal register different from targetReg in which 'tree' produces its result - // because both targetReg and internal reg will be in use at the same time. This is achieved - // by asking for two internal registers. - info->internalFloatCount = 2; + // because both targetReg and internal reg will be in use at the same time. + info->internalFloatCount = 1; + info->isInternalRegDelayFree = true; info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs()); } #endif @@ -195,7 +196,16 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) case GT_STORE_LCL_FLD: case GT_STORE_LCL_VAR: - info->srcCount = 1; +#ifdef _TARGET_X86_ + if (tree->gtGetOp1()->OperGet() == GT_LONG) + { + info->srcCount = 2; + } + else +#endif // _TARGET_X86_ + { + info->srcCount = 1; + } info->dstCount = 0; LowerStoreLoc(tree->AsLclVarCommon()); break; @@ -242,6 +252,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) break; case GT_LIST: + case GT_FIELD_LIST: case GT_ARGPLACE: case GT_NO_OP: case GT_START_NONGC: @@ -319,9 +330,87 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) break; case GT_JTRUE: + { + info->srcCount = 0; + info->dstCount = 0; + + GenTree* cmp = tree->gtGetOp1(); + l->clearDstCount(cmp); + +#ifdef FEATURE_SIMD + // Say we have the following IR + // simdCompareResult = GT_SIMD((In)Equality, v1, v2) + // integerCompareResult = GT_EQ/NE(simdCompareResult, true/false) + // GT_JTRUE(integerCompareResult) + // + // In this case we don't need to generate code for GT_EQ_/NE, since SIMD (In)Equality + // intrinsic would set or clear Zero flag. + + genTreeOps cmpOper = cmp->OperGet(); + if (cmpOper == GT_EQ || cmpOper == GT_NE) + { + GenTree* cmpOp1 = cmp->gtGetOp1(); + GenTree* cmpOp2 = cmp->gtGetOp2(); + + if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1))) + { + // clear dstCount on SIMD node to indicate that + // result doesn't need to be materialized into a register. + l->clearOperandCounts(cmp); + l->clearDstCount(cmpOp1); + l->clearOperandCounts(cmpOp2); + + // Codegen of SIMD (in)Equality uses target integer reg + // only for setting flags. Target reg is not needed on AVX + // when comparing against Vector Zero. In all other cases + // we need to reserve an int type internal register, since we + // have cleared dstCount. 
+ if (compiler->canUseAVX() && cmpOp1->gtGetOp2()->IsIntegralConstVector(0)) + { + // We don't need an internal register,since we use vptest + // for setting flags. + } + else + { + ++(cmpOp1->gtLsraInfo.internalIntCount); + regMaskTP internalCandidates = cmpOp1->gtLsraInfo.getInternalCandidates(l); + internalCandidates |= l->allRegs(TYP_INT); + cmpOp1->gtLsraInfo.setInternalCandidates(l, internalCandidates); + } + + // We would have to reverse compare oper in the following cases: + // 1) SIMD Equality: Sets Zero flag on equal otherwise clears it. + // Therefore, if compare oper is == or != against false(0), we will + // be checking opposite of what is required. + // + // 2) SIMD inEquality: Clears Zero flag on true otherwise sets it. + // Therefore, if compare oper is == or != against true(1), we will + // be checking opposite of what is required. + GenTreeSIMD* simdNode = cmpOp1->AsSIMD(); + if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) + { + if (cmpOp2->IsIntegralConst(0)) + { + cmp->SetOper(GenTree::ReverseRelop(cmpOper)); + } + } + else + { + assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality); + if (cmpOp2->IsIntegralConst(1)) + { + cmp->SetOper(GenTree::ReverseRelop(cmpOper)); + } + } + } + } +#endif // FEATURE_SIMD + } + break; + + case GT_JCC: info->srcCount = 0; info->dstCount = 0; - l->clearDstCount(tree->gtOp.gtOp1); break; case GT_JMP: @@ -436,6 +525,9 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) case GT_MUL: case GT_MULHI: +#if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND) + case GT_MUL_LONG: +#endif SetMulOpCounts(tree); break; @@ -478,6 +570,11 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) info->internalFloatCount = 1; info->setInternalCandidates(l, l->internalFloatRegCandidates()); } + else + { + // Codegen of this tree node sets ZF and SF flags. + tree->gtFlags |= GTF_ZSF_SET; + } break; case GT_NOT: @@ -490,6 +587,10 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) case GT_RSZ: case GT_ROL: case GT_ROR: +#ifdef _TARGET_X86_ + case GT_LSH_HI: + case GT_RSH_LO: +#endif TreeNodeInfoInitShiftRotate(tree); break; @@ -499,7 +600,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) case GT_LE: case GT_GE: case GT_GT: - LowerCmp(tree); + TreeNodeInfoInitCmp(tree); break; case GT_CKFINITE: @@ -542,10 +643,7 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) } break; -#ifdef _TARGET_X86_ - case GT_OBJ: - NYI_X86("GT_OBJ"); -#elif !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) +#if !defined(FEATURE_PUT_STRUCT_ARG_STK) case GT_OBJ: #endif case GT_BLK: @@ -556,11 +654,11 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) info->dstCount = 0; break; -#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +#ifdef FEATURE_PUT_STRUCT_ARG_STK case GT_PUTARG_STK: - TreeNodeInfoInitPutArgStk(tree); + TreeNodeInfoInitPutArgStk(tree->AsPutArgStk()); break; -#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING +#endif // FEATURE_PUT_STRUCT_ARG_STK case GT_STORE_BLK: case GT_STORE_OBJ: @@ -568,6 +666,12 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) TreeNodeInfoInitBlockStore(tree->AsBlk()); break; + case GT_INIT_VAL: + // Always a passthrough of its child's value. + info->srcCount = 0; + info->dstCount = 0; + break; + case GT_LCLHEAP: TreeNodeInfoInitLclHeap(tree); break; @@ -634,14 +738,20 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) case GT_ARR_OFFSET: // This consumes the offset, if any, the arrObj and the effective index, // and produces the flattened offset for this dimension. 
- info->srcCount = 3; - info->dstCount = 1; - info->internalIntCount = 1; + info->srcCount = 3; + info->dstCount = 1; + // we don't want to generate code for this if (tree->gtArrOffs.gtOffset->IsIntegralConst(0)) { MakeSrcContained(tree, tree->gtArrOffs.gtOffset); } + else + { + // Here we simply need an internal register, which must be different + // from any of the operand's registers, but may be the same as targetReg. + info->internalIntCount = 1; + } break; case GT_LEA: @@ -725,15 +835,9 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) #endif case GT_CLS_VAR: - info->srcCount = 0; - // GT_CLS_VAR, by the time we reach the backend, must always - // be a pure use. - // It will produce a result of the type of the - // node, and use an internal register for the address. - - info->dstCount = 1; - assert((tree->gtFlags & (GTF_VAR_DEF | GTF_VAR_USEASG | GTF_VAR_USEDEF)) == 0); - info->internalIntCount = 1; + // These nodes are eliminated by rationalizer. + JITDUMP("Unexpected node %s in Lower.\n", GenTree::NodeName(tree->OperGet())); + unreached(); break; } // end switch (tree->OperGet()) @@ -813,27 +917,36 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) } } + TreeNodeInfoInitCheckByteable(tree); + + // We need to be sure that we've set info->srcCount and info->dstCount appropriately + assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT)); +} + +//------------------------------------------------------------------------ +// TreeNodeInfoInitCheckByteable: Check the tree to see if "byte-able" registers are +// required, and set the tree node info accordingly. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// None. +// +void Lowering::TreeNodeInfoInitCheckByteable(GenTree* tree) +{ #ifdef _TARGET_X86_ + LinearScan* l = m_lsra; + TreeNodeInfo* info = &(tree->gtLsraInfo); + // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands // if the tree node is a byte type. // - // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr' - // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT - // value. In this case we need to exclude esi/edi from the src candidates of op2. - // - // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool. - // - // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses - // ubyte as the result of comparison and if the result needs to be materialized into a reg - // simply zero extend it to TYP_INT size. Here is an example of generated code: - // cmp dl, byte ptr[addr mode] - // movzx edx, dl - // // Though this looks conservative in theory, in practice we could not think of a case where // the below logic leads to conservative register specification. In future when or if we find // one such case, this logic needs to be fine tuned for that case(s). 
- if (varTypeIsByte(tree) || ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType())) || - (tree->OperIsCompare() && varTypeIsByte(tree->gtGetOp1()) && varTypeIsByte(tree->gtGetOp2()))) + + if (ExcludeNonByteableRegisters(tree)) { regMaskTP regMask; if (info->dstCount > 0) @@ -870,9 +983,6 @@ void Lowering::TreeNodeInfoInit(GenTree* tree) } } #endif //_TARGET_X86_ - - // We need to be sure that we've set info->srcCount and info->dstCount appropriately - assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT)); } //------------------------------------------------------------------------ @@ -1028,6 +1138,31 @@ void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree) GenTreePtr shiftBy = tree->gtOp.gtOp2; GenTreePtr source = tree->gtOp.gtOp1; +#ifdef _TARGET_X86_ + // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that + // we can have a three operand form. Increment the srcCount. + if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO) + { + assert(source->OperGet() == GT_LONG); + + info->srcCount++; + + if (tree->OperGet() == GT_LSH_HI) + { + GenTreePtr sourceLo = source->gtOp.gtOp1; + sourceLo->gtLsraInfo.isDelayFree = true; + } + else + { + GenTreePtr sourceHi = source->gtOp.gtOp2; + sourceHi->gtLsraInfo.isDelayFree = true; + } + + source->gtLsraInfo.hasDelayFreeSrc = true; + info->hasDelayFreeSrc = true; + } +#endif + // x64 can encode 8 bits of shift and it will use 5 or 6. (the others are masked off) // We will allow whatever can be encoded - hope you know what you are doing. if (!IsContainableImmed(tree, shiftBy) || (shiftBy->gtIntConCommon.IconValue() > 255) || @@ -1040,6 +1175,17 @@ void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree) else { MakeSrcContained(tree, shiftBy); + + // Note that Rotate Left/Right instructions don't set ZF and SF flags. + // + // If the operand being shifted is 32-bits then upper three bits are masked + // by hardware to get actual shift count. Similarly for 64-bit operands + // shift count is narrowed to [0..63]. If the resulting shift count is zero, + // then shift operation won't modify flags. + // + // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0) + // if the shift count is known to be non-zero and in the range depending on the + // operand size. } } @@ -1088,6 +1234,12 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call) assert(ctrlExpr == nullptr); assert(call->gtCallAddr != nullptr); ctrlExpr = call->gtCallAddr; + +#ifdef _TARGET_X86_ + // Fast tail calls aren't currently supported on x86, but if they ever are, the code + // below that handles indirect VSD calls will need to be fixed. + assert(!call->IsFastTailCall() || !call->IsVirtualStub()); +#endif // _TARGET_X86_ } // set reg requirements on call target represented as control sequence. @@ -1103,7 +1255,24 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call) // computed into a register. if (!call->IsFastTailCall()) { - if (ctrlExpr->isIndir()) +#ifdef _TARGET_X86_ + // On x86, we need to generate a very specific pattern for indirect VSD calls: + // + // 3-byte nop + // call dword ptr [eax] + // + // Where EAX is also used as an argument to the stub dispatch helper. Make + // sure that the call target address is computed into EAX in this case. 
+ if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT)) + { + assert(ctrlExpr->isIndir()); + + ctrlExpr->gtGetOp1()->gtLsraInfo.setSrcCandidates(l, RBM_VIRTUAL_STUB_TARGET); + MakeSrcContained(call, ctrlExpr); + } + else +#endif // _TARGET_X86_ + if (ctrlExpr->isIndir()) { MakeSrcContained(call, ctrlExpr); } @@ -1191,7 +1360,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call) // First, count reg args for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext()) { - assert(list->IsList()); + assert(list->OperIsList()); GenTreePtr argNode = list->Current(); @@ -1206,7 +1375,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call) argNode->gtLsraInfo.srcCount = 1; argNode->gtLsraInfo.dstCount = 0; -#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +#ifdef FEATURE_PUT_STRUCT_ARG_STK // If the node is TYP_STRUCT and it is put on stack with // putarg_stk operation, we consume and produce no registers. // In this case the embedded Obj node should not produce @@ -1218,7 +1387,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call) argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0; argNode->gtLsraInfo.srcCount = 0; } -#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING +#endif // FEATURE_PUT_STRUCT_ARG_STK continue; } @@ -1248,7 +1417,7 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call) // If the struct arg is wrapped in CPYBLK the type of the param will be TYP_VOID. // Use the curArgTabEntry's isStruct to get whether the param is a struct. - if (varTypeIsStruct(argNode) FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY(|| curArgTabEntry->isStruct)) + if (varTypeIsStruct(argNode) PUT_STRUCT_ARG_STK_ONLY(|| curArgTabEntry->isStruct)) { unsigned originalSize = 0; LclVarDsc* varDsc = nullptr; @@ -1270,16 +1439,16 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call) { originalSize = genTypeSize(argNode->gtType); } - else if (argNode->gtOper == GT_LIST) + else if (argNode->gtOper == GT_FIELD_LIST) { originalSize = 0; // There could be up to 2 PUTARG_REGs in the list - GenTreeArgList* argListPtr = argNode->AsArgList(); - unsigned iterationNum = 0; - for (; argListPtr; argListPtr = argListPtr->Rest()) + GenTreeFieldList* fieldListPtr = argNode->AsFieldList(); + unsigned iterationNum = 0; + for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest()) { - GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1; + GenTreePtr putArgRegNode = fieldListPtr->Current(); assert(putArgRegNode->gtOper == GT_PUTARG_REG); if (iterationNum == 0) @@ -1509,7 +1678,7 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) } m_lsra->clearOperandCounts(source); } - else if (!source->OperIsSIMD()) + else if (!source->IsMultiRegCall() && !source->OperIsSIMD()) { assert(source->IsLocal()); MakeSrcContained(blkNode, source); @@ -1519,7 +1688,11 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) if (isInitBlk) { GenTree* initVal = source; - srcAddrOrFill = source; + if (initVal->OperIsInitVal()) + { + initVal = initVal->gtGetOp1(); + } + srcAddrOrFill = initVal; // If we have an InitBlk with constant block size we can optimize several ways: // a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes // we use rep stosb since this reduces the register pressure in LSRA and we have @@ -1571,8 +1744,23 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) // a pack of 16 init value constants. 
blkNode->gtLsraInfo.internalFloatCount = 1; blkNode->gtLsraInfo.setInternalCandidates(l, l->internalFloatRegCandidates()); + if ((fill == 0) && ((size & 0xf) == 0)) + { + MakeSrcContained(blkNode, source); + } } blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll; + +#ifdef _TARGET_X86_ + if ((size & 1) != 0) + { + // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing + // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this + // when unrolling, so only allow byteable registers as the source value. (We could + // consider just using BlkOpKindRepInstr instead.) + sourceRegMask = RBM_BYTE_REGS; + } +#endif // _TARGET_X86_ } else { @@ -1825,7 +2013,7 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) } } -#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING +#ifdef FEATURE_PUT_STRUCT_ARG_STK //------------------------------------------------------------------------ // TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK. // @@ -1835,44 +2023,219 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) // Return Value: // None. // -void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree) +void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk) { - TreeNodeInfo* info = &(tree->gtLsraInfo); + TreeNodeInfo* info = &(putArgStk->gtLsraInfo); LinearScan* l = m_lsra; - if (tree->TypeGet() != TYP_STRUCT) +#ifdef _TARGET_X86_ + if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST) + { + putArgStk->gtNumberReferenceSlots = 0; + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Invalid; + + GenTreeFieldList* fieldList = putArgStk->gtOp1->AsFieldList(); + + // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order + // of uses is visible to LSRA. + unsigned fieldCount = 0; + GenTreeFieldList* head = nullptr; + for (GenTreeFieldList *current = fieldList, *next; current != nullptr; current = next) + { + next = current->Rest(); + + // First, insert the field node into the sorted list. + GenTreeFieldList* prev = nullptr; + for (GenTreeFieldList* cursor = head;; cursor = cursor->Rest()) + { + // If the offset of the current list node is greater than the offset of the cursor or if we have + // reached the end of the list, insert the current node before the cursor and terminate. + if ((cursor == nullptr) || (current->gtFieldOffset > cursor->gtFieldOffset)) + { + if (prev == nullptr) + { + assert(cursor == head); + head = current; + } + else + { + prev->Rest() = current; + } + + current->Rest() = cursor; + break; + } + } + + fieldCount++; + } + + info->srcCount = fieldCount; + info->dstCount = 0; + + // In theory, the upper bound for the size of a field list is 8: these constructs only appear when passing the + // collection of lclVars that represent the fields of a promoted struct lclVar, and we do not promote struct + // lclVars with more than 4 fields. If each of these lclVars is of type long, decomposition will split the + // corresponding field list nodes in two, giving an upper bound of 8. + // + // The reason that this is important is that the algorithm we use above to sort the field list is O(N^2): if + // the maximum size of a field list grows significantly, we will need to reevaluate it. + assert(fieldCount <= 8); + + // The sort above may have changed which node is at the head of the list. Update the PUTARG_STK node if + // necessary. 
+ if (head != fieldList) + { + head->gtFlags |= GTF_FIELD_LIST_HEAD; + fieldList->gtFlags &= ~GTF_FIELD_LIST_HEAD; + +#ifdef DEBUG + head->gtSeqNum = fieldList->gtSeqNum; +#endif // DEBUG + + head->gtLsraInfo = fieldList->gtLsraInfo; + head->gtClearReg(comp); + + BlockRange().InsertAfter(fieldList, head); + BlockRange().Remove(fieldList); + + fieldList = head; + putArgStk->gtOp1 = fieldList; + } + + // Now that the fields have been sorted, initialize the LSRA info. + bool allFieldsAreSlots = true; + bool needsByteTemp = false; + unsigned prevOffset = putArgStk->getArgSize(); + for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest()) + { + GenTree* const fieldNode = current->Current(); + const var_types fieldType = fieldNode->TypeGet(); + const unsigned fieldOffset = current->gtFieldOffset; + assert(fieldType != TYP_LONG); + + // For x86 we must mark all integral fields as contained or reg-optional, and handle them + // accordingly in code generation, since we may have up to 8 fields, which cannot all be in + // registers to be consumed atomically by the call. + if (varTypeIsIntegralOrI(fieldNode)) + { + if (fieldNode->OperGet() == GT_LCL_VAR) + { + LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]); + if (varDsc->lvTracked && !varDsc->lvDoNotEnregister) + { + SetRegOptional(fieldNode); + } + else + { + MakeSrcContained(putArgStk, fieldNode); + } + } + else if (fieldNode->IsIntCnsFitsInI32()) + { + MakeSrcContained(putArgStk, fieldNode); + } + else + { + // For the case where we cannot directly push the value, if we run out of registers, + // it would be better to defer computation until we are pushing the arguments rather + // than spilling, but this situation is not all that common, as most cases of promoted + // structs do not have a large number of fields, and of those most are lclVars or + // copy-propagated constants. + SetRegOptional(fieldNode); + } + } + else + { + assert(varTypeIsFloating(fieldNode)); + } + + // We can treat as a slot any field that is stored at a slot boundary, where the previous + // field is not in the same slot. (Note that we store the fields in reverse order.) + const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4); + if (!fieldIsSlot) + { + allFieldsAreSlots = false; + if (varTypeIsByte(fieldType)) + { + // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes + // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will + // need a byte-addressable register for the store. We will enforce this requirement on an internal + // register, which we can use to copy multiple byte values. + needsByteTemp = true; + } + } + + if (varTypeIsGC(fieldType)) + { + putArgStk->gtNumberReferenceSlots++; + } + + prevOffset = fieldOffset; + } + + // Set the copy kind. + // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should + // adjust the stack once for those fields. The latter is really best done in code generation, but + // this tuning should probably be undertaken as a whole. + // Also, if there are floating point fields, it may be better to use the "Unroll" mode + // of copying the struct as a whole, if the fields are not register candidates. 
+ if (allFieldsAreSlots) + { + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots; + } + else + { + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; + // If any of the fields cannot be stored with an actual push, we may need a temporary + // register to load the value before storing it to the stack location. + info->internalIntCount = 1; + regMaskTP regMask = l->allRegs(TYP_INT); + if (needsByteTemp) + { + regMask &= ~RBM_NON_BYTE_REGS; + } + info->setInternalCandidates(l, regMask); + } + return; + } +#endif // _TARGET_X86_ + +#if defined(FEATURE_SIMD) && defined(_TARGET_X86_) + // For PutArgStk of a TYP_SIMD12, we need an extra register. + if (putArgStk->TypeGet() == TYP_SIMD12) { - TreeNodeInfoInitSimple(tree); + info->srcCount = putArgStk->gtOp1->gtLsraInfo.dstCount; + info->dstCount = 0; + info->internalFloatCount = 1; + info->setInternalCandidates(l, l->allSIMDRegs()); return; } +#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_) - GenTreePutArgStk* putArgStkTree = tree->AsPutArgStk(); + if (putArgStk->TypeGet() != TYP_STRUCT) + { + TreeNodeInfoInitSimple(putArgStk); + return; + } - GenTreePtr dst = tree; - GenTreePtr src = tree->gtOp.gtOp1; + GenTreePtr dst = putArgStk; + GenTreePtr src = putArgStk->gtOp1; GenTreePtr srcAddr = nullptr; + bool haveLocalAddr = false; if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND)) { srcAddr = src->gtOp.gtOp1; + assert(srcAddr != nullptr); + haveLocalAddr = srcAddr->OperIsLocalAddr(); } else { - assert(varTypeIsSIMD(tree)); - } - info->srcCount = src->gtLsraInfo.dstCount; - - // If this is a stack variable address, - // make the op1 contained, so this way - // there is no unnecessary copying between registers. - // To avoid assertion, increment the parent's source. - // It is recovered below. - bool haveLocalAddr = ((srcAddr != nullptr) && (srcAddr->OperIsLocalAddr())); - if (haveLocalAddr) - { - info->srcCount += 1; + assert(varTypeIsSIMD(putArgStk)); } + info->srcCount = src->gtLsraInfo.dstCount; info->dstCount = 0; // In case of a CpBlk we could use a helper call. In case of putarg_stk we @@ -1884,7 +2247,7 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree) // This threshold will decide from using the helper or let the JIT decide to inline // a code sequence of its choice. ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT); - ssize_t size = putArgStkTree->gtNumSlots * TARGET_POINTER_SIZE; + ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE; // TODO-X86-CQ: The helper call either is not supported on x86 or required more work // (I don't know which). @@ -1892,7 +2255,7 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree) // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2. // Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of // our framework assemblies, so this is the main code generation scheme we'll use. - if (size <= CPBLK_UNROLL_LIMIT && putArgStkTree->gtNumberReferenceSlots == 0) + if (size <= CPBLK_UNROLL_LIMIT && putArgStk->gtNumberReferenceSlots == 0) { // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg. 
// @@ -1913,46 +2276,62 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTree* tree) info->setInternalCandidates(l, regMask); } +#ifdef _TARGET_X86_ + if (size >= 8) +#else // !_TARGET_X86_ if (size >= XMM_REGSIZE_BYTES) +#endif // !_TARGET_X86_ { - // If we have a buffer larger than XMM_REGSIZE_BYTES, - // reserve an XMM register to use it for a + // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux, + // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a // series of 16-byte loads and stores. info->internalFloatCount = 1; info->addInternalCandidates(l, l->internalFloatRegCandidates()); } - if (haveLocalAddr) +#ifdef _TARGET_X86_ + if (size < XMM_REGSIZE_BYTES) { - MakeSrcContained(putArgStkTree, srcAddr); + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; } - - // If src or dst are on stack, we don't have to generate the address into a register - // because it's just some constant+SP - putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindUnroll; + else +#endif // _TARGET_X86_ + { + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll; + } + } +#ifdef _TARGET_X86_ + else if (putArgStk->gtNumberReferenceSlots != 0) + { + // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update + // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions. + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; } +#endif // _TARGET_X86_ else { info->internalIntCount += 3; info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI)); - if (haveLocalAddr) - { - MakeSrcContained(putArgStkTree, srcAddr); - } - putArgStkTree->gtPutArgStkKind = GenTreePutArgStk::PutArgStkKindRepInstr; + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr; } // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree. - MakeSrcContained(putArgStkTree, src); + MakeSrcContained(putArgStk, src); - // Balance up the inc above. if (haveLocalAddr) { - info->srcCount -= 1; + // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary + // copies. + // + // To avoid an assertion in MakeSrcContained, increment the parent's source count beforehand and decrement it + // afterwards. + info->srcCount++; + MakeSrcContained(putArgStk, srcAddr); + info->srcCount--; } } -#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING +#endif // FEATURE_PUT_STRUCT_ARG_STK //------------------------------------------------------------------------ // TreeNodeInfoInitLclHeap: Set the NodeInfo for a GT_LCLHEAP. @@ -1976,13 +2355,17 @@ void Lowering::TreeNodeInfoInitLclHeap(GenTree* tree) // Here '-' means don't care. // // Size? Init Memory? # temp regs - // 0 - 0 - // const and <=6 reg words - 0 - // const and >6 reg words Yes 0 + // 0 - 0 (returns 0) + // const and <=6 reg words - 0 (pushes '0') + // const and >6 reg words Yes 0 (pushes '0') // const and =PageSize No 2 - // Non-const Yes 0 - // Non-const No 2 + // (x86:tmpReg for sutracting from esp) + // const and >=PageSize No 2 (regCnt and tmpReg for subtracing from sp) + // Non-const Yes 0 (regCnt=targetReg and pushes '0') + // Non-const No 2 (regCnt and tmpReg for subtracting from sp) + // + // Note: Here we don't need internal register to be different from targetReg. + // Rather, require it to be different from operand's reg. 
GenTreePtr size = tree->gtOp.gtOp1; if (size->IsCnsIntOrI()) @@ -2121,6 +2504,9 @@ void Lowering::TreeNodeInfoInitLogicalOp(GenTree* tree) // as reg optional. SetRegOptionalForBinOp(tree); } + + // Codegen of this tree node sets ZF and SF flags. + tree->gtFlags |= GTF_ZSF_SET; } //------------------------------------------------------------------------ @@ -2189,15 +2575,40 @@ void Lowering::TreeNodeInfoInitModDiv(GenTree* tree) info->setDstCandidates(l, RBM_RAX); } - // If possible would like to have op1 in RAX to avoid a register move - op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX); + bool op2CanBeRegOptional = true; +#ifdef _TARGET_X86_ + if (op1->OperGet() == GT_LONG) + { + // To avoid reg move would like to have op1's low part in RAX and high part in RDX. + GenTree* loVal = op1->gtGetOp1(); + GenTree* hiVal = op1->gtGetOp2(); + + // Src count is actually 3, so increment. + assert(op2->IsCnsIntOrI()); + assert(tree->OperGet() == GT_UMOD); + info->srcCount++; + op2CanBeRegOptional = false; + + // This situation also requires an internal register. + info->internalIntCount = 1; + info->setInternalCandidates(l, l->allRegs(TYP_INT)); + + loVal->gtLsraInfo.setSrcCandidates(l, RBM_EAX); + hiVal->gtLsraInfo.setSrcCandidates(l, RBM_EDX); + } + else +#endif + { + // If possible would like to have op1 in RAX to avoid a register move + op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX); + } // divisor can be an r/m, but the memory indirection must be of the same size as the divide if (op2->isMemoryOp() && (op2->TypeGet() == tree->TypeGet())) { MakeSrcContained(tree, op2); } - else + else if (op2CanBeRegOptional) { op2->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX)); @@ -2298,12 +2709,13 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) info->dstCount = 1; switch (simdTree->gtSIMDIntrinsicID) { + GenTree* op1; GenTree* op2; case SIMDIntrinsicInit: { info->srcCount = 1; - GenTree* op1 = tree->gtOp.gtOp1; + op1 = tree->gtOp.gtOp1; // This sets all fields of a SIMD struct to the given value. // Mark op1 as contained if it is either zero or int constant of all 1's, @@ -2377,7 +2789,8 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) info->srcCount = 2; // SSE2 32-bit integer multiplication requires two temp regs - if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT) + if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT && + comp->getSIMDInstructionSet() == InstructionSet_SSE2) { info->internalFloatCount = 2; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); @@ -2406,38 +2819,78 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) case SIMDIntrinsicOpEquality: case SIMDIntrinsicOpInEquality: - // Need two SIMD registers as scratch. - // See genSIMDIntrinsicRelOp() for details on code sequence generate and - // the need for two scratch registers. - info->srcCount = 2; - info->internalFloatCount = 2; - info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + info->srcCount = 2; + + // On SSE4/AVX, we can generate optimal code for (in)equality + // against zero using ptest. We can safely do the this optimization + // for integral vectors but not for floating-point for the reason + // that we have +0.0 and -0.0 and +0.0 == -0.0 + op2 = tree->gtGetOp2(); + if ((comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0)) + { + MakeSrcContained(tree, op2); + } + else + { + + // Need one SIMD register as scratch. 
+ // See genSIMDIntrinsicRelOp() for details on code sequence generated and + // the need for one scratch register. + // + // Note these intrinsics produce a BOOL result, hence internal float + // registers reserved are guaranteed to be different from target + // integer register without explicitly specifying. + info->internalFloatCount = 1; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + } break; case SIMDIntrinsicDotProduct: - if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) || - (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32)) + // Float/Double vectors: + // For SSE, or AVX with 32-byte vectors, we also need an internal register + // as scratch. Further we need the targetReg and internal reg to be distinct + // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we + // don't need a tmpReg. + // + // 32-byte integer vector on SSE4/AVX: + // will take advantage of phaddd, which operates only on 128-bit xmm reg. + // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal + // registers since targetReg is an int type register. + // + // See genSIMDIntrinsicDotProduct() for details on code sequence generated + // and the need for scratch registers. + if (varTypeIsFloating(simdTree->gtSIMDBaseType)) { - // For SSE, or AVX with 32-byte vectors, we also need an internal register as scratch. - // Further we need the targetReg and internal reg to be distinct registers. - // This is achieved by requesting two internal registers; thus one of them - // will be different from targetReg. - // Note that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg. - // - // See genSIMDIntrinsicDotProduct() for details on code sequence generated and - // the need for scratch registers. - info->internalFloatCount = 2; + if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) || + (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32)) + { + info->internalFloatCount = 1; + info->isInternalRegDelayFree = true; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + } + // else don't need scratch reg(s). + } + else + { + assert(simdTree->gtSIMDBaseType == TYP_INT && comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4); + + // No need to set isInternalRegDelayFree since targetReg is a + // an int type reg and guaranteed to be different from xmm/ymm + // regs. + info->internalFloatCount = comp->canUseAVX() ? 2 : 1; info->setInternalCandidates(lsra, lsra->allSIMDRegs()); } info->srcCount = 2; break; case SIMDIntrinsicGetItem: + { // This implements get_Item method. The sources are: // - the source SIMD struct // - index (which element to get) // The result is baseType of SIMD struct. info->srcCount = 2; + op1 = tree->gtOp.gtOp1; op2 = tree->gtOp.gtOp2; // If the index is a constant, mark it as contained. @@ -2446,48 +2899,69 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) info->srcCount = 1; } - // If the index is not a constant, we will use the SIMD temp location to store the vector. - // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we - // can use that in the process of extracting the element. - // - // If the index is a constant and base type is a small int we can use pextrw, but on AVX - // we will need a temp if are indexing into the upper half of the AVX register. - // In all other cases with constant index, we need a temp xmm register to extract the - // element if index is other than zero. 
- - if (!op2->IsCnsIntOrI()) + if (op1->isMemoryOp()) { - (void)comp->getSIMDInitTempVarNum(); + MakeSrcContained(tree, op1); + + // Although GT_IND of TYP_SIMD12 reserves an internal float + // register for reading 4 and 8 bytes from memory and + // assembling them into target XMM reg, it is not required + // in this case. + op1->gtLsraInfo.internalIntCount = 0; + op1->gtLsraInfo.internalFloatCount = 0; } - else if (!varTypeIsFloating(simdTree->gtSIMDBaseType)) + else { - bool needFloatTemp; - if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) && - (comp->getSIMDInstructionSet() == InstructionSet_AVX)) - { - int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType); - needFloatTemp = (byteShiftCnt >= 16); - } - else + // If the index is not a constant, we will use the SIMD temp location to store the vector. + // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we + // can use that in the process of extracting the element. + // + // If the index is a constant and base type is a small int we can use pextrw, but on AVX + // we will need a temp if are indexing into the upper half of the AVX register. + // In all other cases with constant index, we need a temp xmm register to extract the + // element if index is other than zero. + + if (!op2->IsCnsIntOrI()) { - needFloatTemp = !op2->IsIntegralConst(0); + (void)comp->getSIMDInitTempVarNum(); } - if (needFloatTemp) + else if (!varTypeIsFloating(simdTree->gtSIMDBaseType)) { - info->internalFloatCount = 1; - info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + bool needFloatTemp; + if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) && + (comp->getSIMDInstructionSet() == InstructionSet_AVX)) + { + int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType); + needFloatTemp = (byteShiftCnt >= 16); + } + else + { + needFloatTemp = !op2->IsIntegralConst(0); + } + + if (needFloatTemp) + { + info->internalFloatCount = 1; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + } } } - break; + } + break; case SIMDIntrinsicSetX: case SIMDIntrinsicSetY: case SIMDIntrinsicSetZ: case SIMDIntrinsicSetW: - // We need an internal integer register - info->srcCount = 2; - info->internalIntCount = 1; - info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT)); + info->srcCount = 2; + + // We need an internal integer register for SSE2 codegen + if (comp->getSIMDInstructionSet() == InstructionSet_SSE2) + { + info->internalIntCount = 1; + info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT)); + } + break; case SIMDIntrinsicCast: @@ -2592,6 +3066,8 @@ void Lowering::TreeNodeInfoInitCast(GenTree* tree) { if (genTypeSize(castOpType) == 8) { + // Here we don't need internal register to be different from targetReg, + // rather require it to be different from operand's reg. info->internalIntCount = 1; } } @@ -2693,7 +3169,6 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree) GenTreePtr index = nullptr; unsigned mul, cns; bool rev; - bool modifiedSources = false; #ifdef FEATURE_SIMD // If indirTree is of TYP_SIMD12, don't mark addr as contained @@ -2711,11 +3186,10 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree) info->internalFloatCount = 1; // In case of GT_IND we need an internal register different from targetReg and - // both of the registers are used at the same time. This achieved by reserving - // two internal registers + // both of the registers are used at the same time. 
if (indirTree->OperGet() == GT_IND) { - (info->internalFloatCount)++; + info->isInternalRegDelayFree = true; } info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs()); @@ -2724,16 +3198,21 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree) } #endif // FEATURE_SIMD - // These nodes go into an addr mode: - // - GT_CLS_VAR_ADDR turns into a constant. - // - GT_LCL_VAR_ADDR is a stack addr mode. - if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR)) + if ((indirTree->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0) { + // The address of an indirection that requires its address in a reg. + // Skip any further processing that might otherwise make it contained. + } + else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR)) + { + // These nodes go into an addr mode: + // - GT_CLS_VAR_ADDR turns into a constant. + // - GT_LCL_VAR_ADDR is a stack addr mode. + // make this contained, it turns into a constant that goes into an addr mode MakeSrcContained(indirTree, addr); } - else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp) && - addr->gtLsraInfo.getDstCandidates(m_lsra) != RBM_VIRTUAL_STUB_PARAM) + else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp)) { // Amd64: // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address. @@ -2755,17 +3234,10 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree) } else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr)) { - GenTreeAddrMode* lea = addr->AsAddrMode(); - base = lea->Base(); - index = lea->Index(); - - m_lsra->clearOperandCounts(addr); - // The srcCount is decremented because addr is now "contained", - // then we account for the base and index below, if they are non-null. - info->srcCount--; + MakeSrcContained(indirTree, addr); } else if (comp->codeGen->genCreateAddrMode(addr, -1, true, 0, &rev, &base, &index, &mul, &cns, true /*nogen*/) && - !(modifiedSources = AreSourcesPossiblyModifiedLocals(indirTree, base, index))) + !AreSourcesPossiblyModifiedLocals(indirTree, base, index)) { // An addressing mode will be constructed that may cause some // nodes to not need a register, and cause others' lifetimes to be extended @@ -2774,7 +3246,16 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree) assert(base != addr); m_lsra->clearOperandCounts(addr); - GenTreePtr arrLength = nullptr; + const bool hasBase = base != nullptr; + const bool hasIndex = index != nullptr; + assert(hasBase || hasIndex); // At least one of a base or an index must be present. + + // If the addressing mode has both a base and an index, bump its source count by one. If it only has one or the + // other, its source count is already correct (due to the source for the address itself). + if (hasBase && hasIndex) + { + info->srcCount++; + } // Traverse the computation below GT_IND to find the operands // for the addressing mode, marking the various constants and @@ -2784,14 +3265,13 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree) // up of simple arithmetic operators, and the code generator // only traverses one leg of each node. 
- bool foundBase = (base == nullptr); - bool foundIndex = (index == nullptr); - GenTreePtr nextChild = nullptr; - for (GenTreePtr child = addr; child != nullptr && !child->OperIsLeaf(); child = nextChild) + bool foundBase = !hasBase; + bool foundIndex = !hasIndex; + for (GenTree *child = addr, *nextChild = nullptr; child != nullptr && !child->OperIsLeaf(); child = nextChild) { - nextChild = nullptr; - GenTreePtr op1 = child->gtOp.gtOp1; - GenTreePtr op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr; + nextChild = nullptr; + GenTree* op1 = child->gtOp.gtOp1; + GenTree* op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr; if (op1 == base) { @@ -2832,7 +3312,6 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree) } } assert(foundBase && foundIndex); - info->srcCount--; // it gets incremented below. } else if (addr->gtOper == GT_ARR_ELEM) { @@ -2845,32 +3324,23 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree) assert(addr->gtLsraInfo.srcCount >= 2); addr->gtLsraInfo.srcCount -= 1; } - else - { - // it is nothing but a plain indir - info->srcCount--; // base gets added in below - base = addr; - } - - if (base != nullptr) - { - info->srcCount++; - } - - if (index != nullptr && !modifiedSources) - { - info->srcCount++; - } } -void Lowering::LowerCmp(GenTreePtr tree) +void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree) { + assert(tree->OperIsCompare()); + TreeNodeInfo* info = &(tree->gtLsraInfo); info->srcCount = 2; info->dstCount = 1; #ifdef _TARGET_X86_ + // If the compare is used by a jump, we just need to set the condition codes. If not, then we need + // to store the result into the low byte of a register, which requires the dst be a byteable register. + // We always set the dst candidates, though, because if this is compare is consumed by a jump, they + // won't be used. We might be able to use GTF_RELOP_JMP_USED to determine this case, but it's not clear + // that flag is maintained until this location (especially for decomposed long compares). info->setDstCandidates(m_lsra, RBM_BYTE_REGS); #endif // _TARGET_X86_ @@ -2894,9 +3364,9 @@ void Lowering::LowerCmp(GenTreePtr tree) #endif // !defined(_TARGET_64BIT_) // If either of op1 or op2 is floating point values, then we need to use - // ucomiss or ucomisd to compare, both of which support the following form - // ucomis[s|d] xmm, xmm/mem. That is only the second operand can be a memory - // op. + // ucomiss or ucomisd to compare, both of which support the following form: + // ucomis[s|d] xmm, xmm/mem + // That is only the second operand can be a memory op. // // Second operand is a memory Op: Note that depending on comparison operator, // the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or @@ -2952,16 +3422,9 @@ void Lowering::LowerCmp(GenTreePtr tree) bool hasShortCast = false; if (CheckImmedAndMakeContained(tree, op2)) { - bool op1CanBeContained = (op1Type == op2Type); - if (!op1CanBeContained) - { - if (genTypeSize(op1Type) == genTypeSize(op2Type)) - { - // The constant is of the correct size, but we don't have an exact type match - // We can treat the isMemoryOp as "contained" - op1CanBeContained = true; - } - } + // If the types are the same, or if the constant is of the correct size, + // we can treat the isMemoryOp as contained. 
+ bool op1CanBeContained = (genTypeSize(op1Type) == genTypeSize(op2Type)); // Do we have a short compare against a constant in op2 // @@ -3031,13 +3494,13 @@ void Lowering::LowerCmp(GenTreePtr tree) bool op1IsMadeContained = false; // When op1 is a GT_AND we can often generate a single "test" instruction - // instead of two instructions (an "and" instruction followed by a "cmp"/"test") + // instead of two instructions (an "and" instruction followed by a "cmp"/"test"). // - // This instruction can only be used for equality or inequality comparions. + // This instruction can only be used for equality or inequality comparisons. // and we must have a compare against zero. // // If we have a postive test for a single bit we can reverse the condition and - // make the compare be against zero + // make the compare be against zero. // // Example: // GT_EQ GT_NE @@ -3046,8 +3509,8 @@ void Lowering::LowerCmp(GenTreePtr tree) // / \ / \ // andOp1 GT_CNS (0x100) andOp1 GT_CNS (0x100) // - // We will mark the GT_AND node as contained if the tree is a equality compare with zero - // Additionally when we do this we also allow for a contained memory operand for "andOp1". + // We will mark the GT_AND node as contained if the tree is an equality compare with zero. + // Additionally, when we do this we also allow for a contained memory operand for "andOp1". // bool isEqualityCompare = (tree->gtOper == GT_EQ || tree->gtOper == GT_NE); @@ -3066,7 +3529,7 @@ void Lowering::LowerCmp(GenTreePtr tree) // so that we can generate a test instruction. // Reverse the equality comparison - tree->gtOper = (tree->gtOper == GT_EQ) ? GT_NE : GT_EQ; + tree->SetOperRaw((tree->gtOper == GT_EQ) ? GT_NE : GT_EQ); // Change the relOp2CnsVal to zero relOp2CnsVal = 0; @@ -3171,7 +3634,7 @@ void Lowering::LowerCmp(GenTreePtr tree) genTreeOps castOp1Oper = castOp1->OperGet(); bool safeOper = false; - // It is not always safe to change the gtType of 'castOp1' to TYP_UBYTE + // It is not always safe to change the gtType of 'castOp1' to TYP_UBYTE. // For example when 'castOp1Oper' is a GT_RSZ or GT_RSH then we are shifting // bits from the left into the lower bits. If we change the type to a TYP_UBYTE // we will instead generate a byte sized shift operation: shr al, 24 @@ -3196,22 +3659,24 @@ void Lowering::LowerCmp(GenTreePtr tree) // assert(!castOp1->gtOverflowEx()); // Must not be an overflow checking operation - GenTreePtr removeTreeNode = op1; - tree->gtOp.gtOp1 = castOp1; - op1 = castOp1; - castOp1->gtType = TYP_UBYTE; - - // trim down the value if castOp1 is an int constant since its type changed to UBYTE. - if (castOp1Oper == GT_CNS_INT) - { - castOp1->gtIntCon.gtIconVal = (UINT8)castOp1->gtIntCon.gtIconVal; - } - + // TODO-Cleanup: we're within "if (CheckImmedAndMakeContained(tree, op2))", so isn't + // the following condition always true? if (op2->isContainedIntOrIImmed()) { ssize_t val = (ssize_t)op2->AsIntConCommon()->IconValue(); if (val >= 0 && val <= 255) { + GenTreePtr removeTreeNode = op1; + tree->gtOp.gtOp1 = castOp1; + op1 = castOp1; + castOp1->gtType = TYP_UBYTE; + + // trim down the value if castOp1 is an int constant since its type changed to UBYTE. 
+ if (castOp1Oper == GT_CNS_INT) + { + castOp1->gtIntCon.gtIconVal = (UINT8)castOp1->gtIntCon.gtIconVal; + } + op2->gtType = TYP_UBYTE; tree->gtFlags |= GTF_UNSIGNED; @@ -3222,18 +3687,26 @@ void Lowering::LowerCmp(GenTreePtr tree) MakeSrcContained(tree, op1); op1IsMadeContained = true; } - } - } - BlockRange().Remove(removeTreeNode); + BlockRange().Remove(removeTreeNode); + + // We've changed the type on op1 to TYP_UBYTE, but we already processed that node. + // We need to go back and mark it byteable. + // TODO-Cleanup: it might be better to move this out of the TreeNodeInfoInit pass to + // the earlier "lower" pass, in which case the byteable check would just fall out. + // But that is quite complex! + TreeNodeInfoInitCheckByteable(op1); + #ifdef DEBUG - if (comp->verbose) - { - printf("LowerCmp: Removing a GT_CAST to TYP_UBYTE and changing castOp1->gtType to " - "TYP_UBYTE\n"); - comp->gtDispTreeRange(BlockRange(), tree); - } + if (comp->verbose) + { + printf("TreeNodeInfoInitCmp: Removing a GT_CAST to TYP_UBYTE and changing " + "castOp1->gtType to TYP_UBYTE\n"); + comp->gtDispTreeRange(BlockRange(), tree); + } #endif + } + } } } @@ -3241,6 +3714,41 @@ void Lowering::LowerCmp(GenTreePtr tree) if (!op1IsMadeContained) { SetRegOptional(op1); + + // If op1 codegen sets ZF and SF flags and ==/!= against + // zero, we don't need to generate test instruction, + // provided we don't have another GenTree node between op1 + // and tree that could potentially modify flags. + // + // TODO-CQ: right now the below peep is inexpensive and + // gets the benefit in most of cases because in majority + // of cases op1, op2 and tree would be in that order in + // execution. In general we should be able to check that all + // the nodes that come after op1 in execution order do not + // modify the flags so that it is safe to avoid generating a + // test instruction. Such a check requires that on each + // GenTree node we need to set the info whether its codegen + // will modify flags. + // + // TODO-CQ: We can optimize compare against zero in the + // following cases by generating the branch as indicated + // against each case. + // 1) unsigned compare + // < 0 - always FALSE + // <= 0 - ZF=1 and jne + // > 0 - ZF=0 and je + // >= 0 - always TRUE + // + // 2) signed compare + // < 0 - SF=1 and js + // >= 0 - SF=0 and jns + if (isEqualityCompare && op1->gtSetZSFlags() && op2->IsIntegralConst(0) && (op1->gtNext == op2) && + (op2->gtNext == tree)) + { + // Require codegen of op1 to set the flags. + assert(!op1->gtSetFlags()); + op1->gtFlags |= GTF_SET_FLAGS; + } } } } @@ -3255,10 +3763,17 @@ void Lowering::LowerCmp(GenTreePtr tree) { MakeSrcContained(tree, op1); } + else if (op1->IsCnsIntOrI()) + { + // TODO-CQ: We should be able to support swapping op1 and op2 to generate cmp reg, imm, + // but there is currently an assert in CodeGen::genCompareInt(). + // https://github.com/dotnet/coreclr/issues/7270 + SetRegOptional(op2); + } else { // One of op1 or op2 could be marked as reg optional - // to indicate that codgen can still generate code + // to indicate that codegen can still generate code // if one of them is on stack. 
SetRegOptional(PreferredRegOptionalOperand(tree)); } @@ -3318,7 +3833,6 @@ void Lowering::LowerCast(GenTree* tree) var_types dstType = tree->CastToType(); var_types srcType = op1->TypeGet(); var_types tmpType = TYP_UNDEF; - bool srcUns = false; // force the srcType to unsigned if GT_UNSIGNED flag is set if (tree->gtFlags & GTF_UNSIGNED) @@ -3849,6 +4363,20 @@ bool Lowering::SetStoreIndOpCountsIfRMWMemOp(GenTreePtr storeInd) } m_lsra->clearOperandCounts(indirCandidateChild); +#ifdef _TARGET_X86_ + if (varTypeIsByte(storeInd)) + { + // If storeInd is of TYP_BYTE, set indirOpSources to byteable registers. + bool containedNode = indirOpSource->gtLsraInfo.dstCount == 0; + if (!containedNode) + { + regMaskTP regMask = indirOpSource->gtLsraInfo.getSrcCandidates(m_lsra); + assert(regMask != RBM_NONE); + indirOpSource->gtLsraInfo.setSrcCandidates(m_lsra, regMask & ~RBM_NON_BYTE_REGS); + } + } +#endif + return true; } @@ -3858,8 +4386,11 @@ bool Lowering::SetStoreIndOpCountsIfRMWMemOp(GenTreePtr storeInd) */ void Lowering::SetMulOpCounts(GenTreePtr tree) { +#if defined(_TARGET_X86_) + assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI || tree->OperGet() == GT_MUL_LONG); +#else assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI); - +#endif TreeNodeInfo* info = &(tree->gtLsraInfo); info->srcCount = 2; @@ -3900,13 +4431,18 @@ void Lowering::SetMulOpCounts(GenTreePtr tree) GenTreeIntConCommon* imm = nullptr; GenTreePtr other = nullptr; - // There are three forms of x86 multiply: - // one-op form: RDX:RAX = RAX * r/m - // two-op form: reg *= r/m - // three-op form: reg = r/m * imm +// There are three forms of x86 multiply: +// one-op form: RDX:RAX = RAX * r/m +// two-op form: reg *= r/m +// three-op form: reg = r/m * imm - // This special widening 32x32->64 MUL is not used on x64 - assert((tree->gtFlags & GTF_MUL_64RSLT) == 0); +// This special widening 32x32->64 MUL is not used on x64 +#if defined(_TARGET_X86_) + if (tree->OperGet() != GT_MUL_LONG) +#endif + { + assert((tree->gtFlags & GTF_MUL_64RSLT) == 0); + } // Multiply should never be using small types assert(!varTypeIsSmall(tree->TypeGet())); @@ -3924,12 +4460,21 @@ void Lowering::SetMulOpCounts(GenTreePtr tree) info->setDstCandidates(m_lsra, RBM_RAX); hasImpliedFirstOperand = true; } - else if (tree->gtOper == GT_MULHI) + else if (tree->OperGet() == GT_MULHI) + { + // Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the + // upper 32 bits of the result set the destination candidate to REG_RDX. 
+ info->setDstCandidates(m_lsra, RBM_RDX); + hasImpliedFirstOperand = true; + } +#if defined(_TARGET_X86_) + else if (tree->OperGet() == GT_MUL_LONG) { // have to use the encoding:RDX:RAX = RAX * rm info->setDstCandidates(m_lsra, RBM_RAX); hasImpliedFirstOperand = true; } +#endif else if (IsContainableImmed(tree, op2) || IsContainableImmed(tree, op1)) { if (IsContainableImmed(tree, op2)) @@ -4187,6 +4732,71 @@ GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree) return preferredOp; } +#ifdef _TARGET_X86_ +//------------------------------------------------------------------------ +// ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for +// various reasons +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// If we need to exclude non-byteable registers +// +bool Lowering::ExcludeNonByteableRegisters(GenTree* tree) +{ + // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr' + // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT + // value. In this case we need to exclude esi/edi from the src candidates of op2. + if (varTypeIsByte(tree)) + { + return true; + } + // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool. + else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType())) + { + return true; + } + else if (tree->OperIsCompare()) + { + GenTree* op1 = tree->gtGetOp1(); + GenTree* op2 = tree->gtGetOp2(); + + // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses + // ubyte as the result of comparison and if the result needs to be materialized into a reg + // simply zero extend it to TYP_INT size. Here is an example of generated code: + // cmp dl, byte ptr[addr mode] + // movzx edx, dl + if (varTypeIsByte(op1) && varTypeIsByte(op2)) + { + return true; + } + // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses + // ubyte as the result of the comparison and if the result needs to be materialized into a reg + // simply zero extend it to TYP_INT size. + else if (varTypeIsByte(op1) && op2->IsCnsIntOrI()) + { + return true; + } + // Example4: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses + // ubyte as the result of the comparison and if the result needs to be materialized into a reg + // simply zero extend it to TYP_INT size. + else if (op1->IsCnsIntOrI() && varTypeIsByte(op2)) + { + return true; + } + else + { + return false; + } + } + else + { + return false; + } +} +#endif // _TARGET_X86_ + #endif // _TARGET_XARCH_ #endif // !LEGACY_BACKEND -- cgit v1.2.3
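Note on the GT_FIELD_LIST ordering introduced in TreeNodeInfoInitPutArgStk above: the fields are reordered by descending gtFieldOffset so that the order of uses matches the reverse-offset push order the code generator will emit. The following is a minimal standalone sketch of that descending-offset insertion sort, not JIT code; Node, offset, and next are hypothetical stand-ins for GenTreeFieldList, gtFieldOffset, and Rest(). The O(N^2) insertion is acceptable because, as the patch notes, a field list holds at most 8 nodes.

// Standalone sketch of a descending-offset insertion sort over a singly linked list.
#include <cstdio>

struct Node
{
    unsigned offset;
    Node*    next;
};

// Returns the new head of the list, sorted by descending offset.
Node* SortByDescendingOffset(Node* list)
{
    Node* head = nullptr;
    for (Node *current = list, *next; current != nullptr; current = next)
    {
        next = current->next;

        // Insert 'current' in front of the first sorted node with a smaller or
        // equal offset (or at the end of the sorted list).
        Node* prev = nullptr;
        for (Node* cursor = head;; prev = cursor, cursor = cursor->next)
        {
            if ((cursor == nullptr) || (current->offset > cursor->offset))
            {
                if (prev == nullptr)
                {
                    head = current;
                }
                else
                {
                    prev->next = current;
                }
                current->next = cursor;
                break;
            }
        }
    }
    return head;
}

int main()
{
    Node n8{8, nullptr}, n0{0, &n8}, n4{4, &n0}; // unsorted list: 4 -> 0 -> 8
    for (Node* n = SortByDescendingOffset(&n4); n != nullptr; n = n->next)
    {
        printf("%u ", n->offset); // prints: 8 4 0
    }
    printf("\n");
    return 0;
}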
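Note on the GT_AND/test rewrite in TreeNodeInfoInitCmp above: the lowering turns GT_EQ(AND(x, 0x100), 0x100) into GT_NE(AND(x, 0x100), 0) so a single 'test' instruction can be used. That relies on the identity (x & C) == C being equivalent to (x & C) != 0 when C has exactly one bit set. Below is a small standalone check of that identity, not JIT code; HasSingleBit is a hypothetical helper introduced only for this sketch.

// Standalone check of the single-bit compare rewrite identity.
#include <cassert>
#include <cstdint>

static bool HasSingleBit(uint32_t c)
{
    return (c != 0) && ((c & (c - 1)) == 0);
}

int main()
{
    const uint32_t C = 0x100; // single-bit mask, as in the example in the patch
    assert(HasSingleBit(C));

    for (uint32_t x = 0; x <= 0xFFFF; x++)
    {
        const bool eqForm   = ((x & C) == C); // original GT_EQ form
        const bool testForm = ((x & C) != 0); // rewritten compare-against-zero form
        assert(eqForm == testForm);
    }
    return 0;
}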