Diffstat (limited to 'src/jit/lsraxarch.cpp')
-rw-r--r--  src/jit/lsraxarch.cpp  3684
1 files changed, 3684 insertions, 0 deletions
diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp
new file mode 100644
index 0000000000..a4da2b7ce6
--- /dev/null
+++ b/src/jit/lsraxarch.cpp
@@ -0,0 +1,3684 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XX XX
+XX Register Requirements for AMD64 XX
+XX XX
+XX This encapsulates all the logic for setting register requirements for XX
+XX the AMD64 architecture. XX
+XX XX
+XX XX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+*/
+
+#include "jitpch.h"
+#ifdef _MSC_VER
+#pragma hdrstop
+#endif
+
+#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator
+
+#ifdef _TARGET_XARCH_
+
+#include "jit.h"
+#include "sideeffects.h"
+#include "lower.h"
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitStoreLoc: Set register requirements for a store of a lclVar
+//
+// Arguments:
+// storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR)
+//
+// Notes:
+// This involves:
+// - Setting the appropriate candidates for a store of a multi-reg call return value.
+// - Requesting an internal register for SIMD12 stores.
+// - Handling of contained immediates.
+// - Widening operations of unsigneds. (TODO: Move to 1st phase of Lowering)
+
+void Lowering::TreeNodeInfoInitStoreLoc(GenTreeLclVarCommon* storeLoc)
+{
+ TreeNodeInfo* info = &(storeLoc->gtLsraInfo);
+
+ // Is this the case of var = call where call is returning
+ // a value in multiple return registers?
+ GenTree* op1 = storeLoc->gtGetOp1();
+ if (op1->IsMultiRegCall())
+ {
+        // The backend expects to see this case only for a store lclVar.
+ assert(storeLoc->OperGet() == GT_STORE_LCL_VAR);
+
+ // srcCount = number of registers in which the value is returned by call
+ GenTreeCall* call = op1->AsCall();
+ ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+ info->srcCount = retTypeDesc->GetReturnRegCount();
+
+ // Call node srcCandidates = Bitwise-OR(allregs(GetReturnRegType(i))) for all i=0..RetRegCount-1
+ regMaskTP srcCandidates = m_lsra->allMultiRegCallNodeRegs(call);
+ op1->gtLsraInfo.setSrcCandidates(m_lsra, srcCandidates);
+ return;
+ }
+
+#ifdef FEATURE_SIMD
+ if (varTypeIsSIMD(storeLoc))
+ {
+ if (op1->IsCnsIntOrI())
+ {
+ // InitBlk
+ MakeSrcContained(storeLoc, op1);
+ }
+ else if (storeLoc->TypeGet() == TYP_SIMD12)
+ {
+ // Need an additional register to extract upper 4 bytes of Vector3.
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
+
+ // In this case don't mark the operand as contained as we want it to
+ // be evaluated into an xmm register
+ }
+ return;
+ }
+#endif // FEATURE_SIMD
+
+ // If the source is a containable immediate, make it contained, unless it is
+ // an int-size or larger store of zero to memory, because we can generate smaller code
+ // by zeroing a register and then storing it.
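+    // For example (illustrative only), for an int-sized zero store:
+    //     xor   eax, eax
+    //     mov   dword ptr [mem], eax
+    // is smaller than "mov dword ptr [mem], 0", which encodes a 4-byte immediate.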
+ if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(storeLoc)))
+ {
+ MakeSrcContained(storeLoc, op1);
+ }
+
+ // TODO: This should be moved to Lowering, but it widens the types, which changes the behavior
+ // of the above condition.
+ LowerStoreLoc(storeLoc);
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInit: Set register requirements for a node
+//
+// Arguments:
+// treeNode - the node of interest
+//
+// Notes:
+// Preconditions:
+// LSRA Has been initialized and there is a TreeNodeInfo node
+// already allocated and initialized for every tree in the IR.
+// Postconditions:
+// Every TreeNodeInfo instance has the right annotations on register
+// requirements needed by LSRA to build the Interval Table (source,
+// destination and internal [temp] register counts).
+//
+void Lowering::TreeNodeInfoInit(GenTree* tree)
+{
+ LinearScan* l = m_lsra;
+ Compiler* compiler = comp;
+
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+#ifdef DEBUG
+ if (comp->verbose)
+ {
+ printf("TreeNodeInfoInit:\n");
+ comp->gtDispTreeRange(BlockRange(), tree);
+ }
+#endif
+    // Floating-point types generate AVX instructions (vmovss etc.), so set the flag.
+ SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
+ switch (tree->OperGet())
+ {
+ GenTree* op1;
+ GenTree* op2;
+
+ default:
+ TreeNodeInfoInitSimple(tree);
+ break;
+
+ case GT_LCL_FLD:
+ case GT_LCL_VAR:
+ info->srcCount = 0;
+ info->dstCount = 1;
+
+#ifdef FEATURE_SIMD
+ // Need an additional register to read upper 4 bytes of Vector3.
+ if (tree->TypeGet() == TYP_SIMD12)
+ {
+ // We need an internal register different from targetReg in which 'tree' produces its result
+ // because both targetReg and internal reg will be in use at the same time.
+ info->internalFloatCount = 1;
+ info->isInternalRegDelayFree = true;
+ info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
+ }
+#endif
+ break;
+
+ case GT_STORE_LCL_FLD:
+ case GT_STORE_LCL_VAR:
+#ifdef _TARGET_X86_
+ if (tree->gtGetOp1()->OperGet() == GT_LONG)
+ {
+ info->srcCount = 2;
+ }
+ else
+#endif // _TARGET_X86_
+ {
+ info->srcCount = 1;
+ }
+ info->dstCount = 0;
+ TreeNodeInfoInitStoreLoc(tree->AsLclVarCommon());
+ break;
+
+ case GT_BOX:
+ noway_assert(!"box should not exist here");
+ // The result of 'op1' is also the final result
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+
+ case GT_PHYSREGDST:
+ info->srcCount = 1;
+ info->dstCount = 0;
+ break;
+
+ case GT_COMMA:
+ {
+ GenTreePtr firstOperand;
+ GenTreePtr secondOperand;
+ if (tree->gtFlags & GTF_REVERSE_OPS)
+ {
+ firstOperand = tree->gtOp.gtOp2;
+ secondOperand = tree->gtOp.gtOp1;
+ }
+ else
+ {
+ firstOperand = tree->gtOp.gtOp1;
+ secondOperand = tree->gtOp.gtOp2;
+ }
+ if (firstOperand->TypeGet() != TYP_VOID)
+ {
+ firstOperand->gtLsraInfo.isLocalDefUse = true;
+ firstOperand->gtLsraInfo.dstCount = 0;
+ }
+ if (tree->TypeGet() == TYP_VOID && secondOperand->TypeGet() != TYP_VOID)
+ {
+ secondOperand->gtLsraInfo.isLocalDefUse = true;
+ secondOperand->gtLsraInfo.dstCount = 0;
+ }
+ }
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+
+ case GT_LIST:
+ case GT_FIELD_LIST:
+ case GT_ARGPLACE:
+ case GT_NO_OP:
+ case GT_START_NONGC:
+ case GT_PROF_HOOK:
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+
+ case GT_CNS_DBL:
+ info->srcCount = 0;
+ info->dstCount = 1;
+ break;
+
+#if !defined(_TARGET_64BIT_)
+
+ case GT_LONG:
+ if ((tree->gtLIRFlags & LIR::Flags::IsUnusedValue) != 0)
+ {
+ // An unused GT_LONG node needs to consume its sources.
+ info->srcCount = 2;
+ }
+ else
+ {
+ // Passthrough
+ info->srcCount = 0;
+ }
+
+ info->dstCount = 0;
+ break;
+
+#endif // !defined(_TARGET_64BIT_)
+
+ case GT_QMARK:
+ case GT_COLON:
+ info->srcCount = 0;
+ info->dstCount = 0;
+ unreached();
+ break;
+
+ case GT_RETURN:
+ TreeNodeInfoInitReturn(tree);
+ break;
+
+ case GT_RETFILT:
+ if (tree->TypeGet() == TYP_VOID)
+ {
+ info->srcCount = 0;
+ info->dstCount = 0;
+ }
+ else
+ {
+ assert(tree->TypeGet() == TYP_INT);
+
+ info->srcCount = 1;
+ info->dstCount = 0;
+
+ info->setSrcCandidates(l, RBM_INTRET);
+ tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET);
+ }
+ break;
+
+        // A GT_NOP is a passthrough if it is void or if it has a child,
+        // but it must be considered to produce a dummy value if it has
+        // a type but no child.
+ case GT_NOP:
+ info->srcCount = 0;
+ if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
+ {
+ info->dstCount = 1;
+ }
+ else
+ {
+ info->dstCount = 0;
+ }
+ break;
+
+ case GT_JTRUE:
+ {
+ info->srcCount = 0;
+ info->dstCount = 0;
+
+ GenTree* cmp = tree->gtGetOp1();
+ l->clearDstCount(cmp);
+
+#ifdef FEATURE_SIMD
+ // Say we have the following IR
+ // simdCompareResult = GT_SIMD((In)Equality, v1, v2)
+ // integerCompareResult = GT_EQ/NE(simdCompareResult, true/false)
+ // GT_JTRUE(integerCompareResult)
+ //
+            // In this case we don't need to generate code for GT_EQ/NE, since the SIMD (In)Equality
+ // intrinsic will set or clear the Zero flag.
+
+ genTreeOps cmpOper = cmp->OperGet();
+ if (cmpOper == GT_EQ || cmpOper == GT_NE)
+ {
+ GenTree* cmpOp1 = cmp->gtGetOp1();
+ GenTree* cmpOp2 = cmp->gtGetOp2();
+
+ if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1)))
+ {
+ // We always generate code for a SIMD equality comparison, but the compare
+ // is contained (evaluated as part of the GT_JTRUE).
+ // Neither the SIMD node nor the immediate need to be evaluated into a register.
+ l->clearOperandCounts(cmp);
+ l->clearDstCount(cmpOp1);
+ l->clearOperandCounts(cmpOp2);
+
+ // Codegen of SIMD (in)Equality uses target integer reg only for setting flags.
+ // A target reg is not needed on AVX when comparing against Vector Zero.
+ // In all other cases we need to reserve an int type internal register, since we
+ // have cleared dstCount.
+ if (!compiler->canUseAVX() || !cmpOp1->gtGetOp2()->IsIntegralConstVector(0))
+ {
+ ++(cmpOp1->gtLsraInfo.internalIntCount);
+ regMaskTP internalCandidates = cmpOp1->gtLsraInfo.getInternalCandidates(l);
+ internalCandidates |= l->allRegs(TYP_INT);
+ cmpOp1->gtLsraInfo.setInternalCandidates(l, internalCandidates);
+ }
+
+                    // We have to reverse the compare oper in the following cases:
+                    // 1) SIMD Equality: Sets the Zero flag on equal, otherwise clears it.
+                    //    Therefore, if the compare oper is == or != against false(0), we will
+                    //    be checking the opposite of what is required.
+                    //
+                    // 2) SIMD InEquality: Clears the Zero flag on true, otherwise sets it.
+                    //    Therefore, if the compare oper is == or != against true(1), we will
+                    //    be checking the opposite of what is required.
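+                    // For example (illustrative only): GT_EQ(SIMDEquality(v1, v2), 0) asks
+                    // "are v1 and v2 NOT equal?", while the SIMD equality intrinsic sets ZF when
+                    // they ARE equal, so the relop is reversed to GT_NE before the jump is emitted.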
+ GenTreeSIMD* simdNode = cmpOp1->AsSIMD();
+ if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality)
+ {
+ if (cmpOp2->IsIntegralConst(0))
+ {
+ cmp->SetOper(GenTree::ReverseRelop(cmpOper));
+ }
+ }
+ else
+ {
+ assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality);
+ if (cmpOp2->IsIntegralConst(1))
+ {
+ cmp->SetOper(GenTree::ReverseRelop(cmpOper));
+ }
+ }
+ }
+ }
+#endif // FEATURE_SIMD
+ }
+ break;
+
+ case GT_JCC:
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+
+ case GT_JMP:
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+
+ case GT_SWITCH:
+ // This should never occur since switch nodes must not be visible at this
+ // point in the JIT.
+ info->srcCount = 0;
+ info->dstCount = 0; // To avoid getting uninit errors.
+ noway_assert(!"Switch must be lowered at this point");
+ break;
+
+ case GT_JMPTABLE:
+ info->srcCount = 0;
+ info->dstCount = 1;
+ break;
+
+ case GT_SWITCH_TABLE:
+ info->srcCount = 2;
+ info->internalIntCount = 1;
+ info->dstCount = 0;
+ break;
+
+ case GT_ASG:
+ case GT_ASG_ADD:
+ case GT_ASG_SUB:
+ noway_assert(!"We should never hit any assignment operator in lowering");
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+
+#if !defined(_TARGET_64BIT_)
+ case GT_ADD_LO:
+ case GT_ADD_HI:
+ case GT_SUB_LO:
+ case GT_SUB_HI:
+#endif
+ case GT_ADD:
+ case GT_SUB:
+            // SSE2 arithmetic instructions don't support the "op mem, xmm" form;
+            // they only support the "op xmm, mem/xmm" form.
+ if (varTypeIsFloating(tree->TypeGet()))
+ {
+ // overflow operations aren't supported on float/double types.
+ assert(!tree->gtOverflow());
+
+ op1 = tree->gtGetOp1();
+ op2 = tree->gtGetOp2();
+
+ // No implicit conversions at this stage as the expectation is that
+ // everything is made explicit by adding casts.
+ assert(op1->TypeGet() == op2->TypeGet());
+
+ info->srcCount = 2;
+ info->dstCount = 1;
+
+ if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
+ {
+ MakeSrcContained(tree, op2);
+ }
+ else if (tree->OperIsCommutative() &&
+ (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))))
+ {
+                    // Though we have GT_ADD(op1=memOp, op2=non-memOp), we try to reorder the operands
+ // as long as it is safe so that the following efficient code sequence is generated:
+ // addss/sd targetReg, memOp (if op1Reg == targetReg) OR
+ // movaps targetReg, op2Reg; addss/sd targetReg, [memOp]
+ //
+ // Instead of
+ // movss op1Reg, [memOp]; addss/sd targetReg, Op2Reg (if op1Reg == targetReg) OR
+                    //    movss op1Reg, [memOp]; movaps targetReg, op1Reg; addss/sd targetReg, Op2Reg
+ MakeSrcContained(tree, op1);
+ }
+ else
+ {
+ // If there are no containable operands, we can make an operand reg optional.
+ SetRegOptionalForBinOp(tree);
+ }
+ break;
+ }
+
+ __fallthrough;
+
+ case GT_AND:
+ case GT_OR:
+ case GT_XOR:
+ TreeNodeInfoInitLogicalOp(tree);
+ break;
+
+ case GT_RETURNTRAP:
+ // This just turns into a compare of its child with an int + a conditional call
+ info->srcCount = 1;
+ info->dstCount = 0;
+ if (tree->gtOp.gtOp1->isIndir())
+ {
+ MakeSrcContained(tree, tree->gtOp.gtOp1);
+ }
+ info->internalIntCount = 1;
+ info->setInternalCandidates(l, l->allRegs(TYP_INT));
+ break;
+
+ case GT_MOD:
+ case GT_DIV:
+ case GT_UMOD:
+ case GT_UDIV:
+ TreeNodeInfoInitModDiv(tree);
+ break;
+
+ case GT_MUL:
+ case GT_MULHI:
+#if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)
+ case GT_MUL_LONG:
+#endif
+ TreeNodeInfoInitMul(tree);
+ break;
+
+ case GT_INTRINSIC:
+ TreeNodeInfoInitIntrinsic(tree);
+ break;
+
+#ifdef FEATURE_SIMD
+ case GT_SIMD:
+ TreeNodeInfoInitSIMD(tree);
+ break;
+#endif // FEATURE_SIMD
+
+ case GT_CAST:
+ TreeNodeInfoInitCast(tree);
+ break;
+
+ case GT_NEG:
+ info->srcCount = 1;
+ info->dstCount = 1;
+
+ // TODO-XArch-CQ:
+ // SSE instruction set doesn't have an instruction to negate a number.
+ // The recommended way is to xor the float/double number with a bitmask.
+ // The only way to xor is using xorps or xorpd both of which operate on
+ // 128-bit operands. To hold the bit-mask we would need another xmm
+ // register or a 16-byte aligned 128-bit data constant. Right now emitter
+ // lacks the support for emitting such constants or instruction with mem
+ // addressing mode referring to a 128-bit operand. For now we use an
+ // internal xmm register to load 32/64-bit bitmask from data section.
+ // Note that by trading additional data section memory (128-bit) we can
+ // save on the need for an internal register and also a memory-to-reg
+ // move.
+ //
+ // Note: another option to avoid internal register requirement is by
+ // lowering as GT_SUB(0, src). This will generate code different from
+ // Jit64 and could possibly result in compat issues (?).
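+            // As a rough sketch of the expected sequence (illustrative only), negating a double
+            // would load the sign-bit mask from the data section into the internal xmm register
+            // and xor it with the operand:
+            //     movsd  xmm1, qword ptr [signBitMaskData]   ; 0x8000000000000000
+            //     xorpd  xmm0, xmm1
+            // where 'signBitMaskData' is a hypothetical label for the emitted constant.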
+ if (varTypeIsFloating(tree))
+ {
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(l, l->internalFloatRegCandidates());
+ }
+ else
+ {
+ // Codegen of this tree node sets ZF and SF flags.
+ tree->gtFlags |= GTF_ZSF_SET;
+ }
+ break;
+
+ case GT_NOT:
+ info->srcCount = 1;
+ info->dstCount = 1;
+ break;
+
+ case GT_LSH:
+ case GT_RSH:
+ case GT_RSZ:
+ case GT_ROL:
+ case GT_ROR:
+#ifdef _TARGET_X86_
+ case GT_LSH_HI:
+ case GT_RSH_LO:
+#endif
+ TreeNodeInfoInitShiftRotate(tree);
+ break;
+
+ case GT_EQ:
+ case GT_NE:
+ case GT_LT:
+ case GT_LE:
+ case GT_GE:
+ case GT_GT:
+ case GT_TEST_EQ:
+ case GT_TEST_NE:
+ TreeNodeInfoInitCmp(tree);
+ break;
+
+ case GT_CKFINITE:
+ info->srcCount = 1;
+ info->dstCount = 1;
+ info->internalIntCount = 1;
+ break;
+
+ case GT_CMPXCHG:
+ info->srcCount = 3;
+ info->dstCount = 1;
+
+ // comparand is preferenced to RAX.
+ // Remaining two operands can be in any reg other than RAX.
+ tree->gtCmpXchg.gtOpComparand->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
+ tree->gtCmpXchg.gtOpLocation->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX);
+ tree->gtCmpXchg.gtOpValue->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX);
+ tree->gtLsraInfo.setDstCandidates(l, RBM_RAX);
+ break;
+
+ case GT_LOCKADD:
+ info->srcCount = 2;
+ info->dstCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
+
+ CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
+ break;
+
+ case GT_CALL:
+ TreeNodeInfoInitCall(tree->AsCall());
+ break;
+
+ case GT_ADDR:
+ {
+ // For a GT_ADDR, the child node should not be evaluated into a register
+ GenTreePtr child = tree->gtOp.gtOp1;
+ assert(!l->isCandidateLocalRef(child));
+ l->clearDstCount(child);
+ info->srcCount = 0;
+ info->dstCount = 1;
+ }
+ break;
+
+#if !defined(FEATURE_PUT_STRUCT_ARG_STK)
+ case GT_OBJ:
+#endif
+ case GT_BLK:
+ case GT_DYN_BLK:
+ // These should all be eliminated prior to Lowering.
+ assert(!"Non-store block node in Lowering");
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
+ case GT_PUTARG_STK:
+ LowerPutArgStk(tree->AsPutArgStk());
+ TreeNodeInfoInitPutArgStk(tree->AsPutArgStk());
+ break;
+#endif // FEATURE_PUT_STRUCT_ARG_STK
+
+ case GT_STORE_BLK:
+ case GT_STORE_OBJ:
+ case GT_STORE_DYN_BLK:
+ LowerBlockStore(tree->AsBlk());
+ TreeNodeInfoInitBlockStore(tree->AsBlk());
+ break;
+
+ case GT_INIT_VAL:
+ // Always a passthrough of its child's value.
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+
+ case GT_LCLHEAP:
+ TreeNodeInfoInitLclHeap(tree);
+ break;
+
+ case GT_ARR_BOUNDS_CHECK:
+#ifdef FEATURE_SIMD
+ case GT_SIMD_CHK:
+#endif // FEATURE_SIMD
+ {
+ GenTreeBoundsChk* node = tree->AsBoundsChk();
+ // Consumes arrLen & index - has no result
+ info->srcCount = 2;
+ info->dstCount = 0;
+
+ GenTreePtr other;
+ if (CheckImmedAndMakeContained(tree, node->gtIndex))
+ {
+ other = node->gtArrLen;
+ }
+ else if (CheckImmedAndMakeContained(tree, node->gtArrLen))
+ {
+ other = node->gtIndex;
+ }
+ else if (node->gtIndex->isMemoryOp())
+ {
+ other = node->gtIndex;
+ }
+ else
+ {
+ other = node->gtArrLen;
+ }
+
+ if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet())
+ {
+ if (other->isMemoryOp())
+ {
+ MakeSrcContained(tree, other);
+ }
+ else
+ {
+ // We can mark 'other' as reg optional, since it is not contained.
+ SetRegOptional(other);
+ }
+ }
+ }
+ break;
+
+ case GT_ARR_ELEM:
+ // These must have been lowered to GT_ARR_INDEX
+ noway_assert(!"We should never see a GT_ARR_ELEM in lowering");
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+
+ case GT_ARR_INDEX:
+ info->srcCount = 2;
+ info->dstCount = 1;
+ // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
+ // times while the result is being computed.
+ tree->AsArrIndex()->ArrObj()->gtLsraInfo.isDelayFree = true;
+ info->hasDelayFreeSrc = true;
+ break;
+
+ case GT_ARR_OFFSET:
+ // This consumes the offset, if any, the arrObj and the effective index,
+ // and produces the flattened offset for this dimension.
+ info->srcCount = 3;
+ info->dstCount = 1;
+
+ // we don't want to generate code for this
+ if (tree->gtArrOffs.gtOffset->IsIntegralConst(0))
+ {
+ MakeSrcContained(tree, tree->gtArrOffs.gtOffset);
+ }
+ else
+ {
+ // Here we simply need an internal register, which must be different
+ // from any of the operand's registers, but may be the same as targetReg.
+ info->internalIntCount = 1;
+ }
+ break;
+
+ case GT_LEA:
+ // The LEA usually passes its operands through to the GT_IND, in which case we'll
+ // clear the info->srcCount and info->dstCount later, but we may be instantiating an address,
+ // so we set them here.
+ info->srcCount = 0;
+ if (tree->AsAddrMode()->HasBase())
+ {
+ info->srcCount++;
+ }
+ if (tree->AsAddrMode()->HasIndex())
+ {
+ info->srcCount++;
+ }
+ info->dstCount = 1;
+ break;
+
+ case GT_STOREIND:
+ {
+ info->srcCount = 2;
+ info->dstCount = 0;
+ GenTree* src = tree->gtOp.gtOp2;
+
+ if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
+ {
+ TreeNodeInfoInitGCWriteBarrier(tree);
+ break;
+ }
+
+ // If the source is a containable immediate, make it contained, unless it is
+ // an int-size or larger store of zero to memory, because we can generate smaller code
+ // by zeroing a register and then storing it.
+ if (IsContainableImmed(tree, src) &&
+ (!src->IsIntegralConst(0) || varTypeIsSmall(tree) || tree->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR))
+ {
+ MakeSrcContained(tree, src);
+ }
+ else if (!varTypeIsFloating(tree))
+ {
+ // Perform recognition of trees with the following structure:
+ // StoreInd(addr, BinOp(expr, GT_IND(addr)))
+ // to be able to fold this into an instruction of the form
+ // BINOP [addr], register
+ // where register is the actual place where 'expr' is computed.
+ //
+ // SSE2 doesn't support RMW form of instructions.
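+                // For example (illustrative only):
+                //     StoreInd(addr, GT_ADD(val, GT_IND(addr)))
+                // can be emitted as a single read-modify-write instruction:
+                //     add   dword ptr [addr], valReg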
+ if (TreeNodeInfoInitIfRMWMemOp(tree))
+ {
+ break;
+ }
+ }
+
+ TreeNodeInfoInitIndir(tree);
+ }
+ break;
+
+ case GT_NULLCHECK:
+ info->dstCount = 0;
+ info->srcCount = 1;
+ info->isLocalDefUse = true;
+ break;
+
+ case GT_IND:
+ info->dstCount = 1;
+ info->srcCount = 1;
+ TreeNodeInfoInitIndir(tree);
+ break;
+
+ case GT_CATCH_ARG:
+ info->srcCount = 0;
+ info->dstCount = 1;
+ info->setDstCandidates(l, RBM_EXCEPTION_OBJECT);
+ break;
+
+#if !FEATURE_EH_FUNCLETS
+ case GT_END_LFIN:
+ info->srcCount = 0;
+ info->dstCount = 0;
+ break;
+#endif
+
+ case GT_CLS_VAR:
+ // These nodes are eliminated by rationalizer.
+ JITDUMP("Unexpected node %s in Lower.\n", GenTree::NodeName(tree->OperGet()));
+ unreached();
+ break;
+ } // end switch (tree->OperGet())
+
+ // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
+ // Even then we would like to set isTgtPref on Op1.
+ if (tree->OperIsBinary() && info->srcCount >= 1)
+ {
+ if (isRMWRegOper(tree))
+ {
+ GenTree* op1 = tree->gtOp.gtOp1;
+ GenTree* op2 = tree->gtOp.gtOp2;
+
+ // Commutative opers like add/mul/and/or/xor could reverse the order of
+ // operands if it is safe to do so. In such a case we would like op2 to be
+ // target preferenced instead of op1.
+ if (tree->OperIsCommutative() && op1->gtLsraInfo.dstCount == 0 && op2 != nullptr)
+ {
+ op1 = op2;
+ op2 = tree->gtOp.gtOp1;
+ }
+
+ // If we have a read-modify-write operation, we want to preference op1 to the target.
+ // If op1 is contained, we don't want to preference it, but it won't
+ // show up as a source in that case, so it will be ignored.
+ op1->gtLsraInfo.isTgtPref = true;
+
+ // Is this a non-commutative operator, or is op2 a contained memory op?
+ // (Note that we can't call IsContained() at this point because it uses exactly the
+ // same information we're currently computing.)
+ // In either case, we need to make op2 remain live until the op is complete, by marking
+ // the source(s) associated with op2 as "delayFree".
+ // Note that if op2 of a binary RMW operator is a memory op, even if the operator
+ // is commutative, codegen cannot reverse them.
+ // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
+ // more work to be done to correctly reverse the operands if they involve memory
+ // operands. Also, we may need to handle more cases than GT_IND, especially once
+ // we've modified the register allocator to not require all nodes to be assigned
+ // a register (e.g. a spilled lclVar can often be referenced directly from memory).
+ // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
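+            // For example (illustrative only): for an RMW "add targetReg, [base+index]",
+            // 'base' and 'index' must stay live until targetReg has been written, which
+            // is what marking their sources delayFree accomplishes.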
+
+ GenTree* delayUseSrc = nullptr;
+ // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
+ // to special case them.
+ if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD)
+ {
+ // These tree nodes will have their op1 marked as isDelayFree=true.
+ // Hence these tree nodes should have a Def position so that op1's reg
+ // gets freed at DefLoc+1.
+ if (tree->TypeGet() == TYP_VOID)
+ {
+ // Right now a GT_XADD node could be morphed into a
+ // GT_LOCKADD of TYP_VOID. See gtExtractSideEffList().
+ // Note that it is advantageous to use GT_LOCKADD
+                    // instead of GT_XADD as the former uses lock.add,
+                    // which allows its second operand to be a contained
+                    // immediate, whereas the xadd instruction requires its
+ // second operand to be in a register.
+ assert(tree->gtLsraInfo.dstCount == 0);
+
+ // Give it an artificial type and mark it isLocalDefUse = true.
+ // This would result in a Def position created but not considered
+ // consumed by its parent node.
+ tree->gtType = TYP_INT;
+ tree->gtLsraInfo.isLocalDefUse = true;
+ }
+ else
+ {
+ assert(tree->gtLsraInfo.dstCount != 0);
+ }
+
+ delayUseSrc = op1;
+ }
+ else if ((op2 != nullptr) &&
+ (!tree->OperIsCommutative() || (op2->isMemoryOp() && (op2->gtLsraInfo.srcCount == 0))))
+ {
+ delayUseSrc = op2;
+ }
+ if (delayUseSrc != nullptr)
+ {
+                // If delayUseSrc is an indirection and it doesn't produce a result, then we need to set "delayFree"
+ // on the base & index, if any.
+ // Otherwise, we set it on delayUseSrc itself.
+ if (delayUseSrc->isIndir() && (delayUseSrc->gtLsraInfo.dstCount == 0))
+ {
+ GenTree* base = delayUseSrc->AsIndir()->Base();
+ GenTree* index = delayUseSrc->AsIndir()->Index();
+ if (base != nullptr)
+ {
+ base->gtLsraInfo.isDelayFree = true;
+ }
+ if (index != nullptr)
+ {
+ index->gtLsraInfo.isDelayFree = true;
+ }
+ }
+ else
+ {
+ delayUseSrc->gtLsraInfo.isDelayFree = true;
+ }
+ info->hasDelayFreeSrc = true;
+ }
+ }
+ }
+
+ TreeNodeInfoInitCheckByteable(tree);
+
+ // We need to be sure that we've set info->srcCount and info->dstCount appropriately
+ assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCheckByteable: Check the tree to see if "byte-able" registers are
+// required, and set the tree node info accordingly.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitCheckByteable(GenTree* tree)
+{
+#ifdef _TARGET_X86_
+ LinearScan* l = m_lsra;
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+ // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands
+ // if the tree node is a byte type.
+ //
+    // Though this looks conservative in theory, in practice we could not think of a case where
+    // the logic below leads to an overly conservative register specification. If we ever find
+    // such a case, this logic will need to be fine-tuned for it.
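+    // For background (general x86 facts, not specific to this code): only EAX, EBX, ECX and
+    // EDX have byte-addressable forms (AL/BL/CL/DL), so RBM_NON_BYTE_REGS covers registers
+    // such as ESI and EDI whose low byte cannot be addressed.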
+
+ if (ExcludeNonByteableRegisters(tree))
+ {
+ regMaskTP regMask;
+ if (info->dstCount > 0)
+ {
+ regMask = info->getDstCandidates(l);
+ assert(regMask != RBM_NONE);
+ info->setDstCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
+ }
+
+ if (tree->OperIsSimple() && (info->srcCount > 0))
+ {
+ // No need to set src candidates on a contained child operand.
+ GenTree* op = tree->gtOp.gtOp1;
+ assert(op != nullptr);
+ bool containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0);
+ if (!containedNode)
+ {
+ regMask = op->gtLsraInfo.getSrcCandidates(l);
+ assert(regMask != RBM_NONE);
+ op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
+ }
+
+ if (tree->OperIsBinary() && (tree->gtOp.gtOp2 != nullptr))
+ {
+ op = tree->gtOp.gtOp2;
+ containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0);
+ if (!containedNode)
+ {
+ regMask = op->gtLsraInfo.getSrcCandidates(l);
+ assert(regMask != RBM_NONE);
+ op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
+ }
+ }
+ }
+ }
+#endif //_TARGET_X86_
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitSimple: Sets the srcCount and dstCount for all the trees
+// without special handling based on the tree node type.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitSimple(GenTree* tree)
+{
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+ unsigned kind = tree->OperKind();
+ info->dstCount = tree->IsValue() ? 1 : 0;
+ if (kind & (GTK_CONST | GTK_LEAF))
+ {
+ info->srcCount = 0;
+ }
+ else if (kind & (GTK_SMPOP))
+ {
+ if (tree->gtGetOp2IfPresent() != nullptr)
+ {
+ info->srcCount = 2;
+ }
+ else
+ {
+ info->srcCount = 1;
+ }
+ }
+ else
+ {
+ unreached();
+ }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitReturn: Set the NodeInfo for a GT_RETURN.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitReturn(GenTree* tree)
+{
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+ LinearScan* l = m_lsra;
+ Compiler* compiler = comp;
+
+#if !defined(_TARGET_64BIT_)
+ if (tree->TypeGet() == TYP_LONG)
+ {
+ GenTree* op1 = tree->gtGetOp1();
+ noway_assert(op1->OperGet() == GT_LONG);
+ GenTree* loVal = op1->gtGetOp1();
+ GenTree* hiVal = op1->gtGetOp2();
+ info->srcCount = 2;
+ loVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_LO);
+ hiVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_HI);
+ info->dstCount = 0;
+ }
+ else
+#endif // !defined(_TARGET_64BIT_)
+ {
+ GenTree* op1 = tree->gtGetOp1();
+ regMaskTP useCandidates = RBM_NONE;
+
+ info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
+ info->dstCount = 0;
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (varTypeIsStruct(tree))
+ {
+ // op1 has to be either an lclvar or a multi-reg returning call
+ if (op1->OperGet() == GT_LCL_VAR)
+ {
+ GenTreeLclVarCommon* lclVarCommon = op1->AsLclVarCommon();
+ LclVarDsc* varDsc = &(compiler->lvaTable[lclVarCommon->gtLclNum]);
+ assert(varDsc->lvIsMultiRegRet);
+
+ // Mark var as contained if not enregistrable.
+ if (!varTypeIsEnregisterableStruct(op1))
+ {
+ MakeSrcContained(tree, op1);
+ }
+ }
+ else
+ {
+ noway_assert(op1->IsMultiRegCall());
+
+ ReturnTypeDesc* retTypeDesc = op1->AsCall()->GetReturnTypeDesc();
+ info->srcCount = retTypeDesc->GetReturnRegCount();
+ useCandidates = retTypeDesc->GetABIReturnRegs();
+ }
+ }
+ else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ {
+ // Non-struct type return - determine useCandidates
+ switch (tree->TypeGet())
+ {
+ case TYP_VOID:
+ useCandidates = RBM_NONE;
+ break;
+ case TYP_FLOAT:
+ useCandidates = RBM_FLOATRET;
+ break;
+ case TYP_DOUBLE:
+ useCandidates = RBM_DOUBLERET;
+ break;
+#if defined(_TARGET_64BIT_)
+ case TYP_LONG:
+ useCandidates = RBM_LNGRET;
+ break;
+#endif // defined(_TARGET_64BIT_)
+ default:
+ useCandidates = RBM_INTRET;
+ break;
+ }
+ }
+
+ if (useCandidates != RBM_NONE)
+ {
+ op1->gtLsraInfo.setSrcCandidates(l, useCandidates);
+ }
+ }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitShiftRotate: Set the NodeInfo for a shift or rotate.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree)
+{
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+ LinearScan* l = m_lsra;
+
+ info->srcCount = 2;
+ info->dstCount = 1;
+
+    // For shift operations, the number of bits to shift by must be
+    // in CL if it is not a constant.
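+    // A non-constant shift therefore ends up as something like (illustrative only):
+    //     mov   ecx, shiftByReg
+    //     shl   eax, cl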
+ GenTreePtr shiftBy = tree->gtOp.gtOp2;
+ GenTreePtr source = tree->gtOp.gtOp1;
+
+#ifdef _TARGET_X86_
+ // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
+ // we can have a three operand form. Increment the srcCount.
+ if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
+ {
+ assert(source->OperGet() == GT_LONG);
+
+ info->srcCount++;
+
+ if (tree->OperGet() == GT_LSH_HI)
+ {
+ GenTreePtr sourceLo = source->gtOp.gtOp1;
+ sourceLo->gtLsraInfo.isDelayFree = true;
+ }
+ else
+ {
+ GenTreePtr sourceHi = source->gtOp.gtOp2;
+ sourceHi->gtLsraInfo.isDelayFree = true;
+ }
+
+ source->gtLsraInfo.hasDelayFreeSrc = true;
+ info->hasDelayFreeSrc = true;
+ }
+#endif
+
+    // x64 can encode an 8-bit shift count, but the hardware uses only the low 5 bits (32-bit
+    // operands) or 6 bits (64-bit operands); the rest are masked off.
+    // We will allow whatever can be encoded - hope you know what you are doing.
+ if (!IsContainableImmed(tree, shiftBy) || (shiftBy->gtIntConCommon.IconValue() > 255) ||
+ (shiftBy->gtIntConCommon.IconValue() < 0))
+ {
+ source->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX);
+ shiftBy->gtLsraInfo.setSrcCandidates(l, RBM_RCX);
+ info->setDstCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX);
+ }
+ else
+ {
+ MakeSrcContained(tree, shiftBy);
+
+ // Note that Rotate Left/Right instructions don't set ZF and SF flags.
+ //
+        // If the operand being shifted is 32 bits, then the upper three bits of the shift count
+        // are masked off by the hardware to get the actual shift count. Similarly, for 64-bit
+        // operands the shift count is narrowed to [0..63]. If the resulting shift count is zero,
+        // then the shift operation won't modify the flags.
+ //
+ // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
+ // if the shift count is known to be non-zero and in the range depending on the
+ // operand size.
+ }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCall: Set the NodeInfo for a call.
+//
+// Arguments:
+// call - The call node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
+{
+ TreeNodeInfo* info = &(call->gtLsraInfo);
+ LinearScan* l = m_lsra;
+ Compiler* compiler = comp;
+ bool hasMultiRegRetVal = false;
+ ReturnTypeDesc* retTypeDesc = nullptr;
+
+ info->srcCount = 0;
+ if (call->TypeGet() != TYP_VOID)
+ {
+ hasMultiRegRetVal = call->HasMultiRegRetVal();
+ if (hasMultiRegRetVal)
+ {
+ // dst count = number of registers in which the value is returned by call
+ retTypeDesc = call->GetReturnTypeDesc();
+ info->dstCount = retTypeDesc->GetReturnRegCount();
+ }
+ else
+ {
+ info->dstCount = 1;
+ }
+ }
+ else
+ {
+ info->dstCount = 0;
+ }
+
+ GenTree* ctrlExpr = call->gtControlExpr;
+ if (call->gtCallType == CT_INDIRECT)
+ {
+ // either gtControlExpr != null or gtCallAddr != null.
+ // Both cannot be non-null at the same time.
+ assert(ctrlExpr == nullptr);
+ assert(call->gtCallAddr != nullptr);
+ ctrlExpr = call->gtCallAddr;
+
+#ifdef _TARGET_X86_
+ // Fast tail calls aren't currently supported on x86, but if they ever are, the code
+ // below that handles indirect VSD calls will need to be fixed.
+ assert(!call->IsFastTailCall() || !call->IsVirtualStub());
+#endif // _TARGET_X86_
+ }
+
+ // set reg requirements on call target represented as control sequence.
+ if (ctrlExpr != nullptr)
+ {
+ // we should never see a gtControlExpr whose type is void.
+ assert(ctrlExpr->TypeGet() != TYP_VOID);
+
+ // call can take a Rm op on x64
+ info->srcCount++;
+
+ // In case of fast tail implemented as jmp, make sure that gtControlExpr is
+ // computed into a register.
+ if (!call->IsFastTailCall())
+ {
+#ifdef _TARGET_X86_
+ // On x86, we need to generate a very specific pattern for indirect VSD calls:
+ //
+ // 3-byte nop
+ // call dword ptr [eax]
+ //
+ // Where EAX is also used as an argument to the stub dispatch helper. Make
+ // sure that the call target address is computed into EAX in this case.
+ if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
+ {
+ assert(ctrlExpr->isIndir());
+
+ ctrlExpr->gtGetOp1()->gtLsraInfo.setSrcCandidates(l, RBM_VIRTUAL_STUB_TARGET);
+ MakeSrcContained(call, ctrlExpr);
+ }
+ else
+#endif // _TARGET_X86_
+ if (ctrlExpr->isIndir())
+ {
+ MakeSrcContained(call, ctrlExpr);
+ }
+ }
+ else
+ {
+ // Fast tail call - make sure that call target is always computed in RAX
+ // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
+ ctrlExpr->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
+ }
+ }
+
+ // If this is a varargs call, we will clear the internal candidates in case we need
+ // to reserve some integer registers for copying float args.
+ // We have to do this because otherwise the default candidates are allRegs, and adding
+ // the individual specific registers will have no effect.
+ if (call->IsVarargs())
+ {
+ info->setInternalCandidates(l, RBM_NONE);
+ }
+
+ RegisterType registerType = call->TypeGet();
+
+ // Set destination candidates for return value of the call.
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#ifdef _TARGET_X86_
+ if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
+ {
+ // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
+ // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
+ // correct argument registers.
+ info->setDstCandidates(l, RBM_PINVOKE_TCB);
+ }
+ else
+#endif // _TARGET_X86_
+ if (hasMultiRegRetVal)
+ {
+ assert(retTypeDesc != nullptr);
+ info->setDstCandidates(l, retTypeDesc->GetABIReturnRegs());
+ }
+ else if (varTypeIsFloating(registerType))
+ {
+#ifdef _TARGET_X86_
+ // The return value will be on the X87 stack, and we will need to move it.
+ info->setDstCandidates(l, l->allRegs(registerType));
+#else // !_TARGET_X86_
+ info->setDstCandidates(l, RBM_FLOATRET);
+#endif // !_TARGET_X86_
+ }
+ else if (registerType == TYP_LONG)
+ {
+ info->setDstCandidates(l, RBM_LNGRET);
+ }
+ else
+ {
+ info->setDstCandidates(l, RBM_INTRET);
+ }
+
+ // number of args to a call =
+ // callRegArgs + (callargs - placeholders, setup, etc)
+ // there is an explicit thisPtr but it is redundant
+
+ // If there is an explicit this pointer, we don't want that node to produce anything
+ // as it is redundant
+ if (call->gtCallObjp != nullptr)
+ {
+ GenTreePtr thisPtrNode = call->gtCallObjp;
+
+ if (thisPtrNode->gtOper == GT_PUTARG_REG)
+ {
+ l->clearOperandCounts(thisPtrNode);
+ l->clearDstCount(thisPtrNode->gtOp.gtOp1);
+ }
+ else
+ {
+ l->clearDstCount(thisPtrNode);
+ }
+ }
+
+#if FEATURE_VARARG
+ bool callHasFloatRegArgs = false;
+#endif // FEATURE_VARARG
+
+ // First, count reg args
+ for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
+ {
+ assert(list->OperIsList());
+
+ GenTreePtr argNode = list->Current();
+
+ fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
+ assert(curArgTabEntry);
+
+ if (curArgTabEntry->regNum == REG_STK)
+ {
+ // late arg that is not passed in a register
+ DISPNODE(argNode);
+ assert(argNode->gtOper == GT_PUTARG_STK);
+ argNode->gtLsraInfo.srcCount = 1;
+ argNode->gtLsraInfo.dstCount = 0;
+
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
+ // If the node is TYP_STRUCT and it is put on stack with
+ // putarg_stk operation, we consume and produce no registers.
+            // In this case the embedded Obj node should not produce
+            // registers either, since it is contained.
+ // Note that if it is a SIMD type the argument will be in a register.
+ if (argNode->TypeGet() == TYP_STRUCT)
+ {
+ assert(argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_OBJ);
+ argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0;
+ argNode->gtLsraInfo.srcCount = 0;
+ }
+#endif // FEATURE_PUT_STRUCT_ARG_STK
+ continue;
+ }
+
+ regNumber argReg = REG_NA;
+ regMaskTP argMask = RBM_NONE;
+ short regCount = 0;
+ bool isOnStack = true;
+ if (curArgTabEntry->regNum != REG_STK)
+ {
+ isOnStack = false;
+ var_types argType = argNode->TypeGet();
+
+#if FEATURE_VARARG
+ callHasFloatRegArgs |= varTypeIsFloating(argType);
+#endif // FEATURE_VARARG
+
+ argReg = curArgTabEntry->regNum;
+ regCount = 1;
+
+ // Default case is that we consume one source; modify this later (e.g. for
+ // promoted structs)
+ info->srcCount++;
+
+ argMask = genRegMask(argReg);
+ argNode = argNode->gtEffectiveVal();
+ }
+
+ // If the struct arg is wrapped in CPYBLK the type of the param will be TYP_VOID.
+ // Use the curArgTabEntry's isStruct to get whether the param is a struct.
+ if (varTypeIsStruct(argNode) PUT_STRUCT_ARG_STK_ONLY(|| curArgTabEntry->isStruct))
+ {
+ unsigned originalSize = 0;
+ LclVarDsc* varDsc = nullptr;
+ if (argNode->gtOper == GT_LCL_VAR)
+ {
+ varDsc = compiler->lvaTable + argNode->gtLclVarCommon.gtLclNum;
+ originalSize = varDsc->lvSize();
+ }
+ else if (argNode->gtOper == GT_MKREFANY)
+ {
+ originalSize = 2 * TARGET_POINTER_SIZE;
+ }
+ else if (argNode->gtOper == GT_OBJ)
+ {
+ noway_assert(!"GT_OBJ not supported for amd64");
+ }
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ else if (argNode->gtOper == GT_PUTARG_REG)
+ {
+ originalSize = genTypeSize(argNode->gtType);
+ }
+ else if (argNode->gtOper == GT_FIELD_LIST)
+ {
+ originalSize = 0;
+
+ // There could be up to 2 PUTARG_REGs in the list
+ GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
+ unsigned iterationNum = 0;
+ for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest())
+ {
+ GenTreePtr putArgRegNode = fieldListPtr->Current();
+ assert(putArgRegNode->gtOper == GT_PUTARG_REG);
+
+ if (iterationNum == 0)
+ {
+ varDsc = compiler->lvaTable + putArgRegNode->gtOp.gtOp1->gtLclVarCommon.gtLclNum;
+ originalSize = varDsc->lvSize();
+ assert(originalSize != 0);
+ }
+ else
+ {
+                        // Need an extra source for every node but the first in the list.
+ info->srcCount++;
+
+ // Get the mask for the second putarg_reg
+ argMask = genRegMask(curArgTabEntry->otherRegNum);
+ }
+
+ putArgRegNode->gtLsraInfo.setDstCandidates(l, argMask);
+ putArgRegNode->gtLsraInfo.setSrcCandidates(l, argMask);
+
+ // To avoid redundant moves, have the argument child tree computed in the
+ // register in which the argument is passed to the call.
+ putArgRegNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(putArgRegNode));
+ iterationNum++;
+ }
+
+ assert(iterationNum <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ else
+ {
+ noway_assert(!"Can't predict unsupported TYP_STRUCT arg kind");
+ }
+
+ unsigned slots = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES;
+ unsigned remainingSlots = slots;
+
+ if (!isOnStack)
+ {
+ remainingSlots = slots - 1;
+
+ regNumber reg = (regNumber)(argReg + 1);
+ while (remainingSlots > 0 && reg <= REG_ARG_LAST)
+ {
+ argMask |= genRegMask(reg);
+ reg = (regNumber)(reg + 1);
+ remainingSlots--;
+ regCount++;
+ }
+ }
+
+ short internalIntCount = 0;
+ if (remainingSlots > 0)
+ {
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // This TYP_STRUCT argument is also passed in the outgoing argument area
+ // We need a register to address the TYP_STRUCT
+ internalIntCount = 1;
+#else // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // And we may need 2
+ internalIntCount = 2;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ }
+ argNode->gtLsraInfo.internalIntCount = internalIntCount;
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (argNode->gtOper == GT_PUTARG_REG)
+ {
+ argNode->gtLsraInfo.setDstCandidates(l, argMask);
+ argNode->gtLsraInfo.setSrcCandidates(l, argMask);
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ }
+ else
+ {
+ argNode->gtLsraInfo.setDstCandidates(l, argMask);
+ argNode->gtLsraInfo.setSrcCandidates(l, argMask);
+ }
+
+ // To avoid redundant moves, have the argument child tree computed in the
+ // register in which the argument is passed to the call.
+ if (argNode->gtOper == GT_PUTARG_REG)
+ {
+ argNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(argNode));
+ }
+
+#if FEATURE_VARARG
+ // In the case of a varargs call, the ABI dictates that if we have floating point args,
+ // we must pass the enregistered arguments in both the integer and floating point registers.
+ // Since the integer register is not associated with this arg node, we will reserve it as
+ // an internal register so that it is not used during the evaluation of the call node
+ // (e.g. for the target).
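+        // For example (Windows x64 varargs convention): a double passed in XMM1 must also be
+        // made available in RDX, so RDX is reserved here as an internal register for the call.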
+ if (call->IsVarargs() && varTypeIsFloating(argNode))
+ {
+ regNumber targetReg = compiler->getCallArgIntRegister(argReg);
+ info->setInternalIntCount(info->internalIntCount + 1);
+ info->addInternalCandidates(l, genRegMask(targetReg));
+ }
+#endif // FEATURE_VARARG
+ }
+
+ // Now, count stack args
+ // Note that these need to be computed into a register, but then
+ // they're just stored to the stack - so the reg doesn't
+ // need to remain live until the call. In fact, it must not
+ // because the code generator doesn't actually consider it live,
+ // so it can't be spilled.
+
+ GenTreePtr args = call->gtCallArgs;
+ while (args)
+ {
+ GenTreePtr arg = args->gtOp.gtOp1;
+ if (!(args->gtFlags & GTF_LATE_ARG))
+ {
+ TreeNodeInfo* argInfo = &(arg->gtLsraInfo);
+ if (argInfo->dstCount != 0)
+ {
+ argInfo->isLocalDefUse = true;
+ }
+
+ // If the child of GT_PUTARG_STK is a constant, we don't need a register to
+ // move it to memory (stack location).
+ //
+ // On AMD64, we don't want to make 0 contained, because we can generate smaller code
+ // by zeroing a register and then storing it. E.g.:
+ // xor rdx, rdx
+ // mov gword ptr [rsp+28H], rdx
+ // is 2 bytes smaller than:
+ // mov gword ptr [rsp+28H], 0
+ //
+ // On x86, we push stack arguments; we don't use 'mov'. So:
+ // push 0
+ // is 1 byte smaller than:
+ // xor rdx, rdx
+ // push rdx
+
+ argInfo->dstCount = 0;
+ if (arg->gtOper == GT_PUTARG_STK)
+ {
+ GenTree* op1 = arg->gtOp.gtOp1;
+ if (IsContainableImmed(arg, op1)
+#if defined(_TARGET_AMD64_)
+ && !op1->IsIntegralConst(0)
+#endif // _TARGET_AMD64_
+ )
+ {
+ MakeSrcContained(arg, op1);
+ }
+ }
+ }
+ args = args->gtOp.gtOp2;
+ }
+
+#if FEATURE_VARARG
+ // If it is a fast tail call, it is already preferenced to use RAX.
+    // Therefore, there is no need to set src candidates on the call target again.
+ if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExpr != nullptr))
+ {
+ // Don't assign the call target to any of the argument registers because
+ // we will use them to also pass floating point arguments as required
+ // by Amd64 ABI.
+ ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS));
+ }
+#endif // FEATURE_VARARG
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitBlockStore: Set the NodeInfo for a block store.
+//
+// Arguments:
+// blkNode - The block store node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
+{
+ GenTree* dstAddr = blkNode->Addr();
+ unsigned size = blkNode->gtBlkSize;
+ GenTree* source = blkNode->Data();
+ LinearScan* l = m_lsra;
+ Compiler* compiler = comp;
+
+ // Sources are dest address, initVal or source.
+ // We may require an additional source or temp register for the size.
+ blkNode->gtLsraInfo.srcCount = 2;
+ blkNode->gtLsraInfo.dstCount = 0;
+ blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE);
+ GenTreePtr srcAddrOrFill = nullptr;
+ bool isInitBlk = blkNode->OperIsInitBlkOp();
+
+ regMaskTP dstAddrRegMask = RBM_NONE;
+ regMaskTP sourceRegMask = RBM_NONE;
+ regMaskTP blkSizeRegMask = RBM_NONE;
+
+ if (isInitBlk)
+ {
+ GenTree* initVal = source;
+ if (initVal->OperIsInitVal())
+ {
+ initVal = initVal->gtGetOp1();
+ }
+ srcAddrOrFill = initVal;
+
+ switch (blkNode->gtBlkOpKind)
+ {
+ case GenTreeBlk::BlkOpKindUnroll:
+ assert(initVal->IsCnsIntOrI());
+ if (size >= XMM_REGSIZE_BYTES)
+ {
+ // Reserve an XMM register to fill it with
+ // a pack of 16 init value constants.
+ ssize_t fill = initVal->gtIntCon.gtIconVal & 0xFF;
+ blkNode->gtLsraInfo.internalFloatCount = 1;
+ blkNode->gtLsraInfo.setInternalCandidates(l, l->internalFloatRegCandidates());
+ if ((fill == 0) && ((size & 0xf) == 0))
+ {
+ MakeSrcContained(blkNode, source);
+ }
+                    // Use an XMM register to fill with constants; this may use AVX instructions, so set the flag.
+ SetContainsAVXFlags();
+ }
+#ifdef _TARGET_X86_
+ if ((size & 1) != 0)
+ {
+ // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
+ // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
+ // when unrolling, so only allow byteable registers as the source value. (We could
+ // consider just using BlkOpKindRepInstr instead.)
+ sourceRegMask = RBM_BYTE_REGS;
+ }
+#endif // _TARGET_X86_
+ break;
+
+ case GenTreeBlk::BlkOpKindRepInstr:
+ // rep stos has the following register requirements:
+                // a) The memory address has to be in RDI.
+ // b) The fill value has to be in RAX.
+ // c) The buffer size will go in RCX.
+ dstAddrRegMask = RBM_RDI;
+ srcAddrOrFill = initVal;
+ sourceRegMask = RBM_RAX;
+ blkSizeRegMask = RBM_RCX;
+ break;
+
+ case GenTreeBlk::BlkOpKindHelper:
+#ifdef _TARGET_AMD64_
+ // The helper follows the regular AMD64 ABI.
+ dstAddrRegMask = RBM_ARG_0;
+ sourceRegMask = RBM_ARG_1;
+ blkSizeRegMask = RBM_ARG_2;
+#else // !_TARGET_AMD64_
+ dstAddrRegMask = RBM_RDI;
+ sourceRegMask = RBM_RAX;
+ blkSizeRegMask = RBM_RCX;
+#endif // !_TARGET_AMD64_
+ break;
+
+ default:
+ unreached();
+ }
+ }
+ else
+ {
+ // CopyObj or CopyBlk
+ if (source->gtOper == GT_IND)
+ {
+ srcAddrOrFill = blkNode->Data()->gtGetOp1();
+ // We're effectively setting source as contained, but can't call MakeSrcContained, because the
+ // "inheritance" of the srcCount is to a child not a parent - it would "just work" but could be misleading.
+ // If srcAddr is already non-contained, we don't need to change it.
+ if (srcAddrOrFill->gtLsraInfo.getDstCount() == 0)
+ {
+ srcAddrOrFill->gtLsraInfo.setDstCount(1);
+ srcAddrOrFill->gtLsraInfo.setSrcCount(source->gtLsraInfo.srcCount);
+ }
+ m_lsra->clearOperandCounts(source);
+ }
+ else if (!source->IsMultiRegCall() && !source->OperIsSIMD())
+ {
+ assert(source->IsLocal());
+ MakeSrcContained(blkNode, source);
+ }
+ if (blkNode->OperGet() == GT_STORE_OBJ)
+ {
+ if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr)
+ {
+ // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
+ blkSizeRegMask = RBM_RCX;
+ }
+ // The srcAddr must be in a register. If it was under a GT_IND, we need to subsume all of its
+ // sources.
+ sourceRegMask = RBM_RSI;
+ dstAddrRegMask = RBM_RDI;
+ }
+ else
+ {
+ switch (blkNode->gtBlkOpKind)
+ {
+ case GenTreeBlk::BlkOpKindUnroll:
+ // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
+ //
+ // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
+ // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude
+ // RBM_NON_BYTE_REGS from internal candidates.
+ if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
+ {
+ blkNode->gtLsraInfo.internalIntCount++;
+ regMaskTP regMask = l->allRegs(TYP_INT);
+
+#ifdef _TARGET_X86_
+ if ((size & 1) != 0)
+ {
+ regMask &= ~RBM_NON_BYTE_REGS;
+ }
+#endif
+ blkNode->gtLsraInfo.setInternalCandidates(l, regMask);
+ }
+
+ if (size >= XMM_REGSIZE_BYTES)
+ {
+ // If we have a buffer larger than XMM_REGSIZE_BYTES,
+ // reserve an XMM register to use it for a
+ // series of 16-byte loads and stores.
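+                        // Each unrolled 16-byte chunk then looks roughly like this (illustrative only):
+                        //     movdqu xmm0, xmmword ptr [src]
+                        //     movdqu xmmword ptr [dst], xmm0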
+ blkNode->gtLsraInfo.internalFloatCount = 1;
+ blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates());
+                        // This uses an XMM reg for the loads and stores, so check whether AVX
+                        // instructions are used for codegen and set the ContainsAVX flag.
+ SetContainsAVXFlags();
+ }
+ // If src or dst are on stack, we don't have to generate the address
+ // into a register because it's just some constant+SP.
+ if ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr())
+ {
+ MakeSrcContained(blkNode, srcAddrOrFill);
+ }
+
+ if (dstAddr->OperIsLocalAddr())
+ {
+ MakeSrcContained(blkNode, dstAddr);
+ }
+
+ break;
+
+ case GenTreeBlk::BlkOpKindRepInstr:
+                    // rep movs has the following register requirements:
+ // a) The dest address has to be in RDI.
+ // b) The src address has to be in RSI.
+ // c) The buffer size will go in RCX.
+ dstAddrRegMask = RBM_RDI;
+ sourceRegMask = RBM_RSI;
+ blkSizeRegMask = RBM_RCX;
+ break;
+
+ case GenTreeBlk::BlkOpKindHelper:
+#ifdef _TARGET_AMD64_
+ // The helper follows the regular AMD64 ABI.
+ dstAddrRegMask = RBM_ARG_0;
+ sourceRegMask = RBM_ARG_1;
+ blkSizeRegMask = RBM_ARG_2;
+#else // !_TARGET_AMD64_
+ dstAddrRegMask = RBM_RDI;
+ sourceRegMask = RBM_RAX;
+ blkSizeRegMask = RBM_RCX;
+#endif // !_TARGET_AMD64_
+ break;
+
+ default:
+ unreached();
+ }
+ }
+ }
+
+ if (dstAddrRegMask != RBM_NONE)
+ {
+ dstAddr->gtLsraInfo.setSrcCandidates(l, dstAddrRegMask);
+ }
+ if (sourceRegMask != RBM_NONE)
+ {
+ if (srcAddrOrFill != nullptr)
+ {
+ srcAddrOrFill->gtLsraInfo.setSrcCandidates(l, sourceRegMask);
+ }
+ else
+ {
+ // This is a local source; we'll use a temp register for its address.
+ blkNode->gtLsraInfo.addInternalCandidates(l, sourceRegMask);
+ blkNode->gtLsraInfo.internalIntCount++;
+ }
+ }
+ if (blkSizeRegMask != RBM_NONE)
+ {
+ if (size != 0)
+ {
+ // Reserve a temp register for the block size argument.
+ blkNode->gtLsraInfo.addInternalCandidates(l, blkSizeRegMask);
+ blkNode->gtLsraInfo.internalIntCount++;
+ }
+ else
+ {
+ // The block size argument is a third argument to GT_STORE_DYN_BLK
+ noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK);
+ blkNode->gtLsraInfo.setSrcCount(3);
+ GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
+ blockSize->gtLsraInfo.setSrcCandidates(l, blkSizeRegMask);
+ }
+ }
+}
+
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
+//------------------------------------------------------------------------
+// TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
+{
+ TreeNodeInfo* info = &(putArgStk->gtLsraInfo);
+ LinearScan* l = m_lsra;
+ info->srcCount = 0;
+
+#ifdef _TARGET_X86_
+ if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
+ {
+ unsigned fieldCount = 0;
+ bool needsByteTemp = false;
+ bool needsSimdTemp = false;
+ unsigned prevOffset = putArgStk->getArgSize();
+ for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
+ {
+ GenTree* const fieldNode = current->Current();
+ const var_types fieldType = fieldNode->TypeGet();
+ const unsigned fieldOffset = current->gtFieldOffset;
+ assert(fieldType != TYP_LONG);
+ info->srcCount++;
+
+ // For x86 we must mark all integral fields as contained or reg-optional, and handle them
+ // accordingly in code generation, since we may have up to 8 fields, which cannot all be in
+ // registers to be consumed atomically by the call.
+ if (varTypeIsIntegralOrI(fieldNode))
+ {
+ if (fieldNode->OperGet() == GT_LCL_VAR)
+ {
+ LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]);
+ if (varDsc->lvTracked && !varDsc->lvDoNotEnregister)
+ {
+ SetRegOptional(fieldNode);
+ }
+ else
+ {
+ MakeSrcContained(putArgStk, fieldNode);
+ }
+ }
+ else if (fieldNode->IsIntCnsFitsInI32())
+ {
+ MakeSrcContained(putArgStk, fieldNode);
+ }
+ else
+ {
+ // For the case where we cannot directly push the value, if we run out of registers,
+ // it would be better to defer computation until we are pushing the arguments rather
+ // than spilling, but this situation is not all that common, as most cases of promoted
+ // structs do not have a large number of fields, and of those most are lclVars or
+ // copy-propagated constants.
+ SetRegOptional(fieldNode);
+ }
+ }
+#if defined(FEATURE_SIMD)
+ // Note that we need to check the GT_FIELD_LIST type, not the fieldType. This is because the
+ // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
+ // we "round up" to 16.
+ else if (current->gtFieldType == TYP_SIMD12)
+ {
+ needsSimdTemp = true;
+ }
+#endif // defined(FEATURE_SIMD)
+ else
+ {
+ assert(varTypeIsFloating(fieldNode) || varTypeIsSIMD(fieldNode));
+ }
+
+ // We can treat as a slot any field that is stored at a slot boundary, where the previous
+ // field is not in the same slot. (Note that we store the fields in reverse order.)
+ const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
+ if (!fieldIsSlot)
+ {
+ if (varTypeIsByte(fieldType))
+ {
+ // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
+ // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
+ // need a byte-addressable register for the store. We will enforce this requirement on an internal
+ // register, which we can use to copy multiple byte values.
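+                    // A sketch of the kind of copy this enables (illustrative only; register and
+                    // offset names are hypothetical):
+                    //     mov   al, <field value>
+                    //     mov   byte ptr [esp+offset], al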
+ needsByteTemp = true;
+ }
+ }
+
+ if (varTypeIsGC(fieldType))
+ {
+ putArgStk->gtNumberReferenceSlots++;
+ }
+ prevOffset = fieldOffset;
+ fieldCount++;
+ }
+
+ info->dstCount = 0;
+
+ if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push)
+ {
+ // If any of the fields cannot be stored with an actual push, we may need a temporary
+ // register to load the value before storing it to the stack location.
+ info->internalIntCount = 1;
+ regMaskTP regMask = l->allRegs(TYP_INT);
+ if (needsByteTemp)
+ {
+ regMask &= ~RBM_NON_BYTE_REGS;
+ }
+ info->setInternalCandidates(l, regMask);
+ }
+
+#if defined(FEATURE_SIMD)
+ // For PutArgStk of a TYP_SIMD12, we need a SIMD temp register.
+ if (needsSimdTemp)
+ {
+ info->internalFloatCount += 1;
+ info->addInternalCandidates(l, l->allSIMDRegs());
+ }
+#endif // defined(FEATURE_SIMD)
+
+ return;
+ }
+#endif // _TARGET_X86_
+
+#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+ // For PutArgStk of a TYP_SIMD12, we need an extra register.
+ if (putArgStk->TypeGet() == TYP_SIMD12)
+ {
+ info->srcCount = putArgStk->gtOp1->gtLsraInfo.dstCount;
+ info->dstCount = 0;
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(l, l->allSIMDRegs());
+ return;
+ }
+#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+
+ if (putArgStk->TypeGet() != TYP_STRUCT)
+ {
+ TreeNodeInfoInitSimple(putArgStk);
+ return;
+ }
+
+ GenTreePtr dst = putArgStk;
+ GenTreePtr src = putArgStk->gtOp1;
+ GenTreePtr srcAddr = nullptr;
+
+ bool haveLocalAddr = false;
+ if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
+ {
+ srcAddr = src->gtOp.gtOp1;
+ assert(srcAddr != nullptr);
+ haveLocalAddr = srcAddr->OperIsLocalAddr();
+ }
+ else
+ {
+ assert(varTypeIsSIMD(putArgStk));
+ }
+
+ info->srcCount = src->gtLsraInfo.dstCount;
+ info->dstCount = 0;
+
+ // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
+ // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
+ // our framework assemblies, so this is the main code generation scheme we'll use.
+ ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
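+ // For example, on x64 a 24-byte struct with no GC references copied with Kind::Unroll has an 8-byte
+ // remainder (24 & 15), so an int temp is reserved below in addition to the XMM register used to copy
+ // the first 16 bytes.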
+ switch (putArgStk->gtPutArgStkKind)
+ {
+ case GenTreePutArgStk::Kind::Push:
+ case GenTreePutArgStk::Kind::PushAllSlots:
+ case GenTreePutArgStk::Kind::Unroll:
+ // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
+ //
+ // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
+ // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude
+ // RBM_NON_BYTE_REGS from internal candidates.
+ if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
+ {
+ info->internalIntCount++;
+ regMaskTP regMask = l->allRegs(TYP_INT);
+
+#ifdef _TARGET_X86_
+ if ((size % 2) != 0)
+ {
+ regMask &= ~RBM_NON_BYTE_REGS;
+ }
+#endif
+ info->setInternalCandidates(l, regMask);
+ }
+
+#ifdef _TARGET_X86_
+ if (size >= 8)
+#else // !_TARGET_X86_
+ if (size >= XMM_REGSIZE_BYTES)
+#endif // !_TARGET_X86_
+ {
+ // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
+ // or larger than or equal to 8 bytes on x86, reserve an XMM register to use for a
+ // series of 16-byte loads and stores.
+ info->internalFloatCount = 1;
+ info->addInternalCandidates(l, l->internalFloatRegCandidates());
+ SetContainsAVXFlags();
+ }
+ break;
+
+ case GenTreePutArgStk::Kind::RepInstr:
+ info->internalIntCount += 3;
+ info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI));
+ break;
+
+ default:
+ unreached();
+ }
+
+ // Always mark the OBJ and ADDR as contained by the putarg_stk; codegen will deal with these trees.
+ MakeSrcContained(putArgStk, src);
+
+ if (haveLocalAddr)
+ {
+ // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary
+ // copies.
+ //
+ // To avoid an assertion in MakeSrcContained, increment the parent's source count beforehand and decrement it
+ // afterwards.
+ info->srcCount++;
+ MakeSrcContained(putArgStk, srcAddr);
+ info->srcCount--;
+ }
+}
+#endif // FEATURE_PUT_STRUCT_ARG_STK
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitLclHeap: Set the NodeInfo for a GT_LCLHEAP.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
+{
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+ LinearScan* l = m_lsra;
+ Compiler* compiler = comp;
+
+ info->srcCount = 1;
+ info->dstCount = 1;
+
+ // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
+ // Here '-' means don't care.
+ //
+ // Size? Init Memory? # temp regs
+ // 0 - 0 (returns 0)
+ // const and <=6 reg words - 0 (pushes '0')
+ // const and >6 reg words Yes 0 (pushes '0')
+ // const and <PageSize No 0 (amd64) 1 (x86)
+ // (x86: tmpReg for subtracting from esp)
+ // const and >=PageSize No 2 (regCnt and tmpReg for subtracting from sp)
+ // Non-const Yes 0 (regCnt=targetReg and pushes '0')
+ // Non-const No 2 (regCnt and tmpReg for subtracting from sp)
+ //
+ // Note: Here we don't need the internal register to be different from targetReg.
+ // Rather, we require it to be different from the operand's reg.
+
+ GenTreePtr size = tree->gtOp.gtOp1;
+ if (size->IsCnsIntOrI())
+ {
+ MakeSrcContained(tree, size);
+
+ size_t sizeVal = size->gtIntCon.gtIconVal;
+
+ if (sizeVal == 0)
+ {
+ info->internalIntCount = 0;
+ }
+ else
+ {
+ // Align the requested amount of memory to STACK_ALIGN.
+ // Note: The GenTree node is not updated here, as it is cheap to recompute the stack-aligned size.
+ // This should also help in debugging as we can examine the original size specified with localloc.
+ sizeVal = AlignUp(sizeVal, STACK_ALIGN);
+
+ // For small allocations up to 6 pointer-sized words (i.e. 48 bytes of localloc on amd64, 24 on x86)
+ // we will generate 'push 0'.
+ assert((sizeVal % REGSIZE_BYTES) == 0);
+ size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
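+ // For example, on amd64 a 32-byte localloc is 4 reg-sized words and is emitted as a series of
+ // 'push 0' instructions, so no internal registers are needed.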
+ if (cntRegSizedWords <= 6)
+ {
+ info->internalIntCount = 0;
+ }
+ else if (!compiler->info.compInitMem)
+ {
+ // No need to initialize allocated stack space.
+ if (sizeVal < compiler->eeGetPageSize())
+ {
+#ifdef _TARGET_X86_
+ info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP.
+#else // !_TARGET_X86_
+ info->internalIntCount = 0;
+#endif // !_TARGET_X86_
+ }
+ else
+ {
+ // We need two registers: regCnt and RegTmp
+ info->internalIntCount = 2;
+ }
+ }
+ else
+ {
+ // >6 and need to zero initialize allocated stack space.
+ info->internalIntCount = 0;
+ }
+ }
+ }
+ else
+ {
+ if (!compiler->info.compInitMem)
+ {
+ info->internalIntCount = 2;
+ }
+ else
+ {
+ info->internalIntCount = 0;
+ }
+ }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitLogicalOp: Set the NodeInfo for GT_AND/GT_OR/GT_XOR,
+// as well as GT_ADD/GT_SUB.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitLogicalOp(GenTree* tree)
+{
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+ LinearScan* l = m_lsra;
+
+ // We're not marking a constant hanging on the left of the add
+ // as containable, so we assign it to a register; this has a CQ impact.
+ // TODO-XArch-CQ: Detect this case and generate a single instruction
+ // for GT_ADD(Constant, SomeTree).
+ info->srcCount = 2;
+ info->dstCount = 1;
+
+ GenTree* op1 = tree->gtGetOp1();
+ GenTree* op2 = tree->gtGetOp2();
+
+ // We can directly encode the second operand if it is either a containable constant or a memory-op.
+ // In the case of a memory-op, we can encode it directly provided its type matches the 'tree' type.
+ // This is because during codegen the type of 'tree' is used to determine the emit type size. If the
+ // types do not match, they get normalized (i.e. sign/zero extended) on load into a register.
+ bool directlyEncodable = false;
+ bool binOpInRMW = false;
+ GenTreePtr operand = nullptr;
+
+ if (IsContainableImmed(tree, op2))
+ {
+ directlyEncodable = true;
+ operand = op2;
+ }
+ else
+ {
+ binOpInRMW = IsBinOpInRMWStoreInd(tree);
+ if (!binOpInRMW)
+ {
+ if (op2->isMemoryOp() && tree->TypeGet() == op2->TypeGet())
+ {
+ directlyEncodable = true;
+ operand = op2;
+ }
+ else if (tree->OperIsCommutative())
+ {
+ if (IsContainableImmed(tree, op1) ||
+ (op1->isMemoryOp() && tree->TypeGet() == op1->TypeGet() && IsSafeToContainMem(tree, op1)))
+ {
+ // If it is safe, we can reverse the order of operands of commutative operations for efficient
+ // codegen
+ directlyEncodable = true;
+ operand = op1;
+ }
+ }
+ }
+ }
+
+ if (directlyEncodable)
+ {
+ assert(operand != nullptr);
+ MakeSrcContained(tree, operand);
+ }
+ else if (!binOpInRMW)
+ {
+ // If this binary op neither has contained operands, nor is a
+ // Read-Modify-Write (RMW) operation, we can mark its operands
+ // as reg optional.
+ SetRegOptionalForBinOp(tree);
+ }
+
+ // Codegen of this tree node sets ZF and SF flags.
+ tree->gtFlags |= GTF_ZSF_SET;
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitModDiv(GenTree* tree)
+{
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+ LinearScan* l = m_lsra;
+
+ GenTree* op1 = tree->gtGetOp1();
+ GenTree* op2 = tree->gtGetOp2();
+
+ info->srcCount = 2;
+ info->dstCount = 1;
+
+ switch (tree->OperGet())
+ {
+ case GT_MOD:
+ case GT_DIV:
+ if (varTypeIsFloating(tree->TypeGet()))
+ {
+ // No implicit conversions at this stage as the expectation is that
+ // everything is made explicit by adding casts.
+ assert(op1->TypeGet() == op2->TypeGet());
+
+ if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
+ {
+ MakeSrcContained(tree, op2);
+ }
+ else
+ {
+ // If there are no containable operands, we can make an operand reg optional.
+ // SSE2 allows only op2 to be a memory-op.
+ SetRegOptional(op2);
+ }
+
+ return;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ // Amd64 Div/Idiv instruction:
+ // The dividend is in RDX:RAX; it computes
+ // the quotient into RAX and the remainder into RDX.
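+ // (e.g. a 32-bit 'idiv r/m32' divides EDX:EAX by its operand, leaving the quotient in EAX and the
+ // remainder in EDX)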
+
+ if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
+ {
+ // We are interested in just the remainder.
+ // RAX is used as a trashable register during computation of the remainder.
+ info->setDstCandidates(l, RBM_RDX);
+ }
+ else
+ {
+ // We are interested in just the quotient.
+ // RDX gets used as a trashable register during computation of the quotient.
+ info->setDstCandidates(l, RBM_RAX);
+ }
+
+ bool op2CanBeRegOptional = true;
+#ifdef _TARGET_X86_
+ if (op1->OperGet() == GT_LONG)
+ {
+ // To avoid a reg move we would like to have op1's low part in RAX and its high part in RDX.
+ GenTree* loVal = op1->gtGetOp1();
+ GenTree* hiVal = op1->gtGetOp2();
+
+ // Src count is actually 3, so increment.
+ assert(op2->IsCnsIntOrI());
+ assert(tree->OperGet() == GT_UMOD);
+ info->srcCount++;
+ op2CanBeRegOptional = false;
+
+ // This situation also requires an internal register.
+ info->internalIntCount = 1;
+ info->setInternalCandidates(l, l->allRegs(TYP_INT));
+
+ loVal->gtLsraInfo.setSrcCandidates(l, RBM_EAX);
+ hiVal->gtLsraInfo.setSrcCandidates(l, RBM_EDX);
+ }
+ else
+#endif
+ {
+ // If possible we would like to have op1 in RAX to avoid a register move.
+ op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
+ }
+
+ // divisor can be an r/m, but the memory indirection must be of the same size as the divide
+ if (op2->isMemoryOp() && (op2->TypeGet() == tree->TypeGet()))
+ {
+ MakeSrcContained(tree, op2);
+ }
+ else if (op2CanBeRegOptional)
+ {
+ op2->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
+
+ // If there are no containable operands, we can make an operand reg optional.
+ // Div instruction allows only op2 to be a memory op.
+ SetRegOptional(op2);
+ }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitIntrinsic(GenTree* tree)
+{
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+ LinearScan* l = m_lsra;
+
+ // Both operand and its result must be of floating point type.
+ GenTree* op1 = tree->gtGetOp1();
+ assert(varTypeIsFloating(op1));
+ assert(op1->TypeGet() == tree->TypeGet());
+
+ info->srcCount = 1;
+ info->dstCount = 1;
+
+ switch (tree->gtIntrinsic.gtIntrinsicId)
+ {
+ case CORINFO_INTRINSIC_Sqrt:
+ if (op1->isMemoryOp() || op1->IsCnsNonZeroFltOrDbl())
+ {
+ MakeSrcContained(tree, op1);
+ }
+ else
+ {
+ // Mark the operand as reg optional since codegen can still
+ // generate code if op1 is on the stack.
+ SetRegOptional(op1);
+ }
+ break;
+
+ case CORINFO_INTRINSIC_Abs:
+ // Abs(float x) = x & 0x7fffffff
+ // Abs(double x) = x & 0x7fffffff ffffffff
+
+ // In case of Abs we need an internal register to hold mask.
+
+ // TODO-XArch-CQ: avoid using an internal register for the mask.
+ // Andps or andpd both will operate on 128-bit operands.
+ // The data section constant that holds the mask is only 64 bits in size.
+ // Therefore, we need both the operand and mask to be in
+ // xmm register. When we add support in emitter to emit 128-bit
+ // data constants and instructions that operate on 128-bit
+ // memory operands we can avoid the need for an internal register.
+ if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
+ {
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(l, l->internalFloatRegCandidates());
+ }
+ break;
+
+#ifdef _TARGET_X86_
+ case CORINFO_INTRINSIC_Cos:
+ case CORINFO_INTRINSIC_Sin:
+ case CORINFO_INTRINSIC_Round:
+ NYI_X86("Math intrinsics Cos, Sin and Round");
+ break;
+#endif // _TARGET_X86_
+
+ default:
+ // Right now only Sqrt/Abs are treated as math intrinsics
+ noway_assert(!"Unsupported math intrinsic");
+ unreached();
+ break;
+ }
+}
+
+#ifdef FEATURE_SIMD
+//------------------------------------------------------------------------
+// TreeNodeInfoInitSIMD: Set the NodeInfo for a GT_SIMD tree.
+//
+// Arguments:
+// tree - The GT_SIMD node of interest
+//
+// Return Value:
+// None.
+
+void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
+{
+ GenTreeSIMD* simdTree = tree->AsSIMD();
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+ LinearScan* lsra = m_lsra;
+ info->dstCount = 1;
+ SetContainsAVXFlags(true, simdTree->gtSIMDSize);
+ switch (simdTree->gtSIMDIntrinsicID)
+ {
+ GenTree* op1;
+ GenTree* op2;
+
+ case SIMDIntrinsicInit:
+ {
+ op1 = tree->gtOp.gtOp1;
+
+#if !defined(_TARGET_64BIT_)
+ if (op1->OperGet() == GT_LONG)
+ {
+ info->srcCount = 2;
+ }
+ else
+#endif // !defined(_TARGET_64BIT_)
+ {
+ info->srcCount = 1;
+ }
+
+ // This sets all fields of a SIMD struct to the given value.
+ // Mark op1 as contained if it is either zero or an int constant of all 1's,
+ // or a float constant with a 16- or 32-byte simdType (AVX case).
+ //
+ // Should never see small int base type vectors except for zero initialization.
+ assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
+
+#if !defined(_TARGET_64BIT_)
+ if (op1->OperGet() == GT_LONG)
+ {
+ GenTree* op1lo = op1->gtGetOp1();
+ GenTree* op1hi = op1->gtGetOp2();
+
+ if ((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
+ (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)))
+ {
+ assert(op1->gtLsraInfo.srcCount == 0);
+ assert(op1->gtLsraInfo.dstCount == 0);
+ assert(op1lo->gtLsraInfo.srcCount == 0);
+ assert(op1lo->gtLsraInfo.dstCount == 1);
+ assert(op1hi->gtLsraInfo.srcCount == 0);
+ assert(op1hi->gtLsraInfo.dstCount == 1);
+
+ op1lo->gtLsraInfo.dstCount = 0;
+ op1hi->gtLsraInfo.dstCount = 0;
+ info->srcCount = 0;
+ }
+ else
+ {
+ // need a temp
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ info->isInternalRegDelayFree = true;
+ }
+ }
+ else
+#endif // !defined(_TARGET_64BIT_)
+ if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
+ (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
+ {
+ MakeSrcContained(tree, op1);
+ info->srcCount = 0;
+ }
+ else if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) &&
+ ((simdTree->gtSIMDSize == 16) || (simdTree->gtSIMDSize == 32)))
+ {
+ // Either op1 is a float or dbl constant or an addr
+ if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr())
+ {
+ MakeSrcContained(tree, op1);
+ info->srcCount = 0;
+ }
+ }
+ }
+ break;
+
+ case SIMDIntrinsicInitN:
+ {
+ info->srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(simdTree->gtSIMDBaseType));
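+ // For example, initializing a Vector3 from three floats gives srcCount = 12 / 4 = 3.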
+
+ // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
+ break;
+
+ case SIMDIntrinsicInitArray:
+ // We have an array and an index, which may be contained.
+ info->srcCount = 2;
+ CheckImmedAndMakeContained(tree, tree->gtGetOp2());
+ break;
+
+ case SIMDIntrinsicDiv:
+ // SSE2 has no instruction support for division on integer vectors
+ noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
+ info->srcCount = 2;
+ break;
+
+ case SIMDIntrinsicAbs:
+ // float/double vectors: Abs gets implemented as a bitwise-AND operation
+ // with a mask, and hence we should never see it here.
+ //
+ // Must be a Vector<int>, Vector<short>, or Vector<sbyte>.
+ assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT ||
+ simdTree->gtSIMDBaseType == TYP_BYTE);
+ assert(comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4);
+ info->srcCount = 1;
+ break;
+
+ case SIMDIntrinsicSqrt:
+ // SSE2 has no instruction support for sqrt on integer vectors.
+ noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
+ info->srcCount = 1;
+ break;
+
+ case SIMDIntrinsicAdd:
+ case SIMDIntrinsicSub:
+ case SIMDIntrinsicMul:
+ case SIMDIntrinsicBitwiseAnd:
+ case SIMDIntrinsicBitwiseAndNot:
+ case SIMDIntrinsicBitwiseOr:
+ case SIMDIntrinsicBitwiseXor:
+ case SIMDIntrinsicMin:
+ case SIMDIntrinsicMax:
+ info->srcCount = 2;
+
+ // SSE2 32-bit integer multiplication requires two temp regs
+ if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
+ comp->getSIMDInstructionSet() == InstructionSet_SSE2)
+ {
+ info->internalFloatCount = 2;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
+ break;
+
+ case SIMDIntrinsicEqual:
+ info->srcCount = 2;
+ break;
+
+ // SSE2 doesn't support < and <= directly on int vectors.
+ // Instead we need to use > and >= with swapped operands.
+ case SIMDIntrinsicLessThan:
+ case SIMDIntrinsicLessThanOrEqual:
+ info->srcCount = 2;
+ noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
+ break;
+
+ // SIMDIntrinsicGreaterThan is supported only on non-floating point base type vectors.
+ // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors.
+ // Instead we need to use < and <= with swapped operands.
+ case SIMDIntrinsicGreaterThan:
+ noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
+ info->srcCount = 2;
+ break;
+
+ case SIMDIntrinsicOpEquality:
+ case SIMDIntrinsicOpInEquality:
+ info->srcCount = 2;
+
+ // On SSE4/AVX, we can generate optimal code for (in)equality
+ // against zero using ptest. We can safely do this optimization
+ // for integral vectors but not for floating-point, because
+ // +0.0 and -0.0 have different bit patterns yet compare equal.
+ op2 = tree->gtGetOp2();
+ if ((comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0))
+ {
+ MakeSrcContained(tree, op2);
+ }
+ else
+ {
+ // Need one SIMD register as scratch.
+ // See genSIMDIntrinsicRelOp() for details on code sequence generated and
+ // the need for one scratch register.
+ //
+ // Note that these intrinsics produce a BOOL result, so the reserved internal
+ // float registers are guaranteed to be different from the (integer) target
+ // register without our explicitly specifying it.
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
+ break;
+
+ case SIMDIntrinsicDotProduct:
+ // Float/Double vectors:
+ // For SSE, or AVX with 32-byte vectors, we also need an internal register
+ // as scratch. Further we need the targetReg and internal reg to be distinct
+ // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
+ // don't need a tmpReg.
+ //
+ // 32-byte integer vector on SSE4/AVX:
+ // will take advantage of phaddd, which operates only on 128-bit xmm reg.
+ // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
+ // registers since targetReg is an int type register.
+ //
+ // See genSIMDIntrinsicDotProduct() for details on code sequence generated
+ // and the need for scratch registers.
+ if (varTypeIsFloating(simdTree->gtSIMDBaseType))
+ {
+ if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) ||
+ (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
+ {
+ info->internalFloatCount = 1;
+ info->isInternalRegDelayFree = true;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
+ // else don't need scratch reg(s).
+ }
+ else
+ {
+ assert(simdTree->gtSIMDBaseType == TYP_INT && comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4);
+
+ // No need to set isInternalRegDelayFree since targetReg is
+ // an int type reg and guaranteed to be different from xmm/ymm
+ // regs.
+ info->internalFloatCount = comp->canUseAVX() ? 2 : 1;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
+ info->srcCount = 2;
+ break;
+
+ case SIMDIntrinsicGetItem:
+ {
+ // This implements get_Item method. The sources are:
+ // - the source SIMD struct
+ // - index (which element to get)
+ // The result is baseType of SIMD struct.
+ info->srcCount = 2;
+ op1 = tree->gtOp.gtOp1;
+ op2 = tree->gtOp.gtOp2;
+
+ // If the index is a constant, mark it as contained.
+ if (CheckImmedAndMakeContained(tree, op2))
+ {
+ info->srcCount = 1;
+ }
+
+ if (op1->isMemoryOp())
+ {
+ MakeSrcContained(tree, op1);
+
+ // Although GT_IND of TYP_SIMD12 reserves an internal float
+ // register for reading 4 and 8 bytes from memory and
+ // assembling them into target XMM reg, it is not required
+ // in this case.
+ op1->gtLsraInfo.internalIntCount = 0;
+ op1->gtLsraInfo.internalFloatCount = 0;
+ }
+ else
+ {
+ // If the index is not a constant, we will use the SIMD temp location to store the vector.
+ // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
+ // can use that in the process of extracting the element.
+ //
+ // If the index is a constant and base type is a small int we can use pextrw, but on AVX
+ // we will need a temp if we are indexing into the upper half of the AVX register.
+ // In all other cases with constant index, we need a temp xmm register to extract the
+ // element if index is other than zero.
+
+ if (!op2->IsCnsIntOrI())
+ {
+ (void)comp->getSIMDInitTempVarNum();
+ }
+ else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
+ {
+ bool needFloatTemp;
+ if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
+ (comp->getSIMDInstructionSet() == InstructionSet_AVX))
+ {
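+ // For example, extracting element 10 of a Vector<short> under AVX gives byteShiftCnt = 20,
+ // which lands in the upper 16 bytes of the ymm register, so a float temp is needed.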
+ int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
+ needFloatTemp = (byteShiftCnt >= 16);
+ }
+ else
+ {
+ needFloatTemp = !op2->IsIntegralConst(0);
+ }
+
+ if (needFloatTemp)
+ {
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ }
+ }
+ }
+ }
+ break;
+
+ case SIMDIntrinsicSetX:
+ case SIMDIntrinsicSetY:
+ case SIMDIntrinsicSetZ:
+ case SIMDIntrinsicSetW:
+ info->srcCount = 2;
+
+ // We need an internal integer register for SSE2 codegen
+ if (comp->getSIMDInstructionSet() == InstructionSet_SSE2)
+ {
+ info->internalIntCount = 1;
+ info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT));
+ }
+
+ break;
+
+ case SIMDIntrinsicCast:
+ info->srcCount = 1;
+ break;
+
+ case SIMDIntrinsicShuffleSSE2:
+ info->srcCount = 2;
+ // Second operand is an integer constant and marked as contained.
+ op2 = tree->gtOp.gtOp2;
+ noway_assert(op2->IsCnsIntOrI());
+ MakeSrcContained(tree, op2);
+ break;
+
+ case SIMDIntrinsicGetX:
+ case SIMDIntrinsicGetY:
+ case SIMDIntrinsicGetZ:
+ case SIMDIntrinsicGetW:
+ case SIMDIntrinsicGetOne:
+ case SIMDIntrinsicGetZero:
+ case SIMDIntrinsicGetCount:
+ case SIMDIntrinsicGetAllOnes:
+ assert(!"Get intrinsics should not be seen during Lowering.");
+ unreached();
+
+ default:
+ noway_assert(!"Unimplemented SIMD node type.");
+ unreached();
+ }
+}
+#endif // FEATURE_SIMD
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCast: Set the NodeInfo for a GT_CAST.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitCast(GenTree* tree)
+{
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+ // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register.
+ // see CodeGen::genIntToIntCast()
+
+ info->srcCount = 1;
+ info->dstCount = 1;
+
+ // Non-overflow casts to/from float/double are done using SSE2 instructions,
+ // which allow the source operand to be either a reg or a memop. Given the
+ // fact that casts from small int to float/double are done as two-level casts,
+ // the source operand is always guaranteed to be of size 4 or 8 bytes.
+ var_types castToType = tree->CastToType();
+ GenTreePtr castOp = tree->gtCast.CastOp();
+ var_types castOpType = castOp->TypeGet();
+ if (tree->gtFlags & GTF_UNSIGNED)
+ {
+ castOpType = genUnsignedType(castOpType);
+ }
+
+ if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType)))
+ {
+#ifdef DEBUG
+ // If converting to float/double, the operand must be 4 or 8 byte in size.
+ if (varTypeIsFloating(castToType))
+ {
+ unsigned opSize = genTypeSize(castOpType);
+ assert(opSize == 4 || opSize == 8);
+ }
+#endif // DEBUG
+
+ // U8 -> R8 conversion requires that the operand be in a register.
+ if (castOpType != TYP_ULONG)
+ {
+ if (castOp->isMemoryOp() || castOp->IsCnsNonZeroFltOrDbl())
+ {
+ MakeSrcContained(tree, castOp);
+ }
+ else
+ {
+ // Mark castOp as reg optional to indicate codegen
+ // can still generate code if it is on the stack.
+ SetRegOptional(castOp);
+ }
+ }
+ }
+
+#if !defined(_TARGET_64BIT_)
+ if (varTypeIsLong(castOpType))
+ {
+ noway_assert(castOp->OperGet() == GT_LONG);
+ info->srcCount = 2;
+ }
+#endif // !defined(_TARGET_64BIT_)
+
+ // some overflow checks need a temp reg:
+ // - GT_CAST from INT64/UINT64 to UINT32
+ if (tree->gtOverflow() && (castToType == TYP_UINT))
+ {
+ if (genTypeSize(castOpType) == 8)
+ {
+ // Here we don't need the internal register to be different from targetReg;
+ // rather, we require it to be different from the operand's reg.
+ info->internalIntCount = 1;
+ }
+ }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitGCWriteBarrier: Set the NodeInfo for a GT_STOREIND requiring a write barrier.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitGCWriteBarrier(GenTree* tree)
+{
+ assert(tree->OperGet() == GT_STOREIND);
+
+ GenTreeStoreInd* dst = tree->AsStoreInd();
+ GenTreePtr addr = dst->Addr();
+ GenTreePtr src = dst->Data();
+
+ if (addr->OperGet() == GT_LEA)
+ {
+ // In the case where we are doing a helper assignment, if the dst
+ // is an indir through an lea, we need to actually instantiate the
+ // lea in a register
+ GenTreeAddrMode* lea = addr->AsAddrMode();
+
+ int leaSrcCount = 0;
+ if (lea->HasBase())
+ {
+ leaSrcCount++;
+ }
+ if (lea->HasIndex())
+ {
+ leaSrcCount++;
+ }
+ lea->gtLsraInfo.srcCount = leaSrcCount;
+ lea->gtLsraInfo.dstCount = 1;
+ }
+
+ bool useOptimizedWriteBarrierHelper = false; // By default, assume no optimized write barriers.
+
+#if NOGC_WRITE_BARRIERS
+
+#if defined(_TARGET_X86_)
+
+ useOptimizedWriteBarrierHelper = true; // On x86, use the optimized write barriers by default.
+#ifdef DEBUG
+ GCInfo::WriteBarrierForm wbf = comp->codeGen->gcInfo.gcIsWriteBarrierCandidate(tree, src);
+ if (wbf == GCInfo::WBF_NoBarrier_CheckNotHeapInDebug) // This one is always a call to a C++ method.
+ {
+ useOptimizedWriteBarrierHelper = false;
+ }
+#endif
+
+ if (useOptimizedWriteBarrierHelper)
+ {
+ // Special write barrier:
+ // op1 (addr) goes into REG_WRITE_BARRIER (rdx) and
+ // op2 (src) goes into any int register.
+ addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER);
+ src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER_SRC);
+ }
+
+#else // !defined(_TARGET_X86_)
+#error "NOGC_WRITE_BARRIERS is not supported"
+#endif // !defined(_TARGET_X86_)
+
+#endif // NOGC_WRITE_BARRIERS
+
+ if (!useOptimizedWriteBarrierHelper)
+ {
+ // For the standard JIT Helper calls:
+ // op1 (addr) goes into REG_ARG_0 and
+ // op2 (src) goes into REG_ARG_1
+ addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0);
+ src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1);
+ }
+
+ // Both src and dst must reside in a register, which they should since we haven't set
+ // either of them as contained.
+ assert(addr->gtLsraInfo.dstCount == 1);
+ assert(src->gtLsraInfo.dstCount == 1);
+}
+
+//-----------------------------------------------------------------------------------------
+// TreeNodeInfoInitIndir: Specify register requirements for address expression of an indirection operation.
+//
+// Arguments:
+// indirTree - GT_IND or GT_STOREIND gentree node
+//
+void Lowering::TreeNodeInfoInitIndir(GenTreePtr indirTree)
+{
+ assert(indirTree->isIndir());
+ // If this is the rhs of a block copy (i.e. non-enregisterable struct),
+ // it has no register requirements.
+ if (indirTree->TypeGet() == TYP_STRUCT)
+ {
+ return;
+ }
+
+ GenTreePtr addr = indirTree->gtGetOp1();
+ TreeNodeInfo* info = &(indirTree->gtLsraInfo);
+
+ GenTreePtr base = nullptr;
+ GenTreePtr index = nullptr;
+ unsigned mul, cns;
+ bool rev;
+
+#ifdef FEATURE_SIMD
+ // If indirTree is of TYP_SIMD12, don't mark addr as contained
+ // so that it always gets computed into a register. This means
+ // the codegen-side logic doesn't need to handle all possible
+ // addr expressions that could be contained.
+ //
+ // TODO-XArch-CQ: handle other addr mode expressions that could be marked
+ // as contained.
+ if (indirTree->TypeGet() == TYP_SIMD12)
+ {
+ // Vector3 is read/written as two reads/writes: 8 byte and 4 byte.
+ // To assemble the vector properly we would need an additional
+ // XMM register.
+ info->internalFloatCount = 1;
+
+ // In case of GT_IND we need an internal register different from targetReg and
+ // both of the registers are used at the same time.
+ if (indirTree->OperGet() == GT_IND)
+ {
+ info->isInternalRegDelayFree = true;
+ }
+
+ info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
+
+ return;
+ }
+#endif // FEATURE_SIMD
+
+ if ((indirTree->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0)
+ {
+ // The address of an indirection that requires its address in a reg.
+ // Skip any further processing that might otherwise make it contained.
+ }
+ else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
+ {
+ // These nodes go into an addr mode:
+ // - GT_CLS_VAR_ADDR turns into a constant.
+ // - GT_LCL_VAR_ADDR is a stack addr mode.
+
+ // make this contained, it turns into a constant that goes into an addr mode
+ MakeSrcContained(indirTree, addr);
+ }
+ else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp))
+ {
+ // Amd64:
+ // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address.
+ // (i.e. those VSD calls for which stub addr is known during JIT compilation time). In this case,
+ // VM requires us to pass stub addr in REG_VIRTUAL_STUB_PARAM - see LowerVirtualStubCall(). For
+ // that reason we cannot mark such an addr as contained. Note that this is not an issue for
+ // indirect VSD calls since morphArgs() is explicitly materializing hidden param as a non-standard
+ // argument.
+ //
+ // Workaround:
+ // Note that LowerVirtualStubCall() sets addr->gtRegNum to REG_VIRTUAL_STUB_PARAM and Lowering::doPhase()
+ // sets destination candidates on such nodes and resets addr->gtRegNum to REG_NA before calling
+ // TreeNodeInfoInit(). Ideally we should set a flag on addr nodes that shouldn't be marked as contained
+ // (in LowerVirtualStubCall()), but we don't have any GTF_* flags left for that purpose. As a workaround
+ // an explicit check is made here.
+ //
+ // On x86, direct VSD is done via a relative branch, and in fact it MUST be contained.
+ MakeSrcContained(indirTree, addr);
+ }
+ else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr))
+ {
+ MakeSrcContained(indirTree, addr);
+ }
+ else if (addr->gtOper == GT_ARR_ELEM)
+ {
+ // The GT_ARR_ELEM consumes all the indices and produces the offset.
+ // The array object lives until the mem access.
+ // We also consume the target register into which the address is
+ // computed.
+
+ info->srcCount++;
+ assert(addr->gtLsraInfo.srcCount >= 2);
+ addr->gtLsraInfo.srcCount -= 1;
+ }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCmp: Set the register requirements for a compare.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree)
+{
+ assert(tree->OperIsCompare());
+
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+ info->srcCount = 2;
+ info->dstCount = 1;
+
+#ifdef _TARGET_X86_
+ // If the compare is used by a jump, we just need to set the condition codes. If not, then we need
+ // to store the result into the low byte of a register, which requires the dst be a byteable register.
+ // We always set the dst candidates, though, because if this compare is consumed by a jump, they
+ // won't be used. We might be able to use GTF_RELOP_JMP_USED to determine this case, but it's not clear
+ // that flag is maintained until this location (especially for decomposed long compares).
+ info->setDstCandidates(m_lsra, RBM_BYTE_REGS);
+#endif // _TARGET_X86_
+
+ GenTreePtr op1 = tree->gtOp.gtOp1;
+ GenTreePtr op2 = tree->gtOp.gtOp2;
+ var_types op1Type = op1->TypeGet();
+ var_types op2Type = op2->TypeGet();
+
+#if !defined(_TARGET_64BIT_)
+ // Long compares will consume GT_LONG nodes, each of which produces two results.
+ // Thus for each long operand there will be an additional source.
+ // TODO-X86-CQ: Mark hiOp2 and loOp2 as contained if it is a constant or a memory op.
+ if (varTypeIsLong(op1Type))
+ {
+ info->srcCount++;
+ }
+ if (varTypeIsLong(op2Type))
+ {
+ info->srcCount++;
+ }
+#endif // !defined(_TARGET_64BIT_)
+
+ // If either of op1 or op2 is floating point values, then we need to use
+ // ucomiss or ucomisd to compare, both of which support the following form:
+ // ucomis[s|d] xmm, xmm/mem
+ // That is, only the second operand can be a memory op.
+ //
+ // Second operand is a memory Op: Note that depending on comparison operator,
+ // the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or
+ // op2 can be a memory op depending on the comparison operator.
+ if (varTypeIsFloating(op1Type))
+ {
+ // The type of the operands has to be the same and no implicit conversions at this stage.
+ assert(op1Type == op2Type);
+
+ bool reverseOps;
+ if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0)
+ {
+ // Unordered comparison case
+ reverseOps = tree->OperIs(GT_GT, GT_GE);
+ }
+ else
+ {
+ reverseOps = tree->OperIs(GT_LT, GT_LE);
+ }
+
+ GenTreePtr otherOp;
+ if (reverseOps)
+ {
+ otherOp = op1;
+ }
+ else
+ {
+ otherOp = op2;
+ }
+
+ assert(otherOp != nullptr);
+ if (otherOp->IsCnsNonZeroFltOrDbl())
+ {
+ MakeSrcContained(tree, otherOp);
+ }
+ else if (otherOp->isMemoryOp() && ((otherOp == op2) || IsSafeToContainMem(tree, otherOp)))
+ {
+ MakeSrcContained(tree, otherOp);
+ }
+ else
+ {
+ // SSE2 allows only otherOp to be a memory-op. Since otherOp is not
+ // contained, we can mark it reg-optional.
+ SetRegOptional(otherOp);
+ }
+
+ return;
+ }
+
+ // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here
+ // or in other backend.
+
+ if (CheckImmedAndMakeContained(tree, op2))
+ {
+ // If the types are the same, or if the constant is of the correct size,
+ // we can mark op1 as contained if it is a memory op.
+ if (op1Type == op2Type)
+ {
+ if (op1->isMemoryOp())
+ {
+ MakeSrcContained(tree, op1);
+ }
+ // If op1 codegen sets ZF and SF flags and ==/!= against
+ // zero, we don't need to generate test instruction,
+ // provided we don't have another GenTree node between op1
+ // and tree that could potentially modify flags.
+ //
+ // TODO-CQ: right now the peep below is inexpensive and
+ // gets the benefit in most cases because op1, op2 and
+ // tree will usually appear in that order in execution.
+ // In general we should be able to check that all
+ // the nodes that come after op1 in execution order do not
+ // modify the flags so that it is safe to avoid generating a
+ // test instruction. Such a check requires that on each
+ // GenTree node we need to set the info whether its codegen
+ // will modify flags.
+ //
+ // TODO-CQ: We can optimize compare against zero in the
+ // following cases by generating the branch as indicated
+ // against each case.
+ // 1) unsigned compare
+ // < 0 - always FALSE
+ // <= 0 - ZF=1 and jne
+ // > 0 - ZF=0 and je
+ // >= 0 - always TRUE
+ //
+ // 2) signed compare
+ // < 0 - SF=1 and js
+ // >= 0 - SF=0 and jns
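+ //
+ // For example, for '(x & y) == 0' the GT_AND already sets ZF and SF (see TreeNodeInfoInitLogicalOp),
+ // so codegen can branch on the flags without emitting a separate test instruction.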
+ else if (tree->OperIs(GT_EQ, GT_NE) && op1->gtSetZSFlags() && op2->IsIntegralConst(0) &&
+ (op1->gtNext == op2) && (op2->gtNext == tree))
+ {
+ // Require codegen of op1 to set the flags.
+ assert(!op1->gtSetFlags());
+ op1->gtFlags |= GTF_SET_FLAGS;
+ }
+ else
+ {
+ SetRegOptional(op1);
+ }
+ }
+ }
+ else if (op1Type == op2Type)
+ {
+ // Note that TEST does not have a r,rm encoding like CMP has but we can still
+ // contain the second operand because the emitter maps both r,rm and rm,r to
+ // the same instruction code. This avoids the need to special case TEST here.
+ if (op2->isMemoryOp())
+ {
+ MakeSrcContained(tree, op2);
+ }
+ else if (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))
+ {
+ MakeSrcContained(tree, op1);
+ }
+ else if (op1->IsCnsIntOrI())
+ {
+ // TODO-CQ: We should be able to support swapping op1 and op2 to generate cmp reg, imm,
+ // but there is currently an assert in CodeGen::genCompareInt().
+ // https://github.com/dotnet/coreclr/issues/7270
+ SetRegOptional(op2);
+ }
+ else
+ {
+ // One of op1 or op2 could be marked as reg optional
+ // to indicate that codegen can still generate code
+ // if one of them is on stack.
+ SetRegOptional(PreferredRegOptionalOperand(tree));
+ }
+ }
+}
+
+//--------------------------------------------------------------------------------------------
+// TreeNodeInfoInitIfRMWMemOp: Checks to see if there is a RMW memory operation rooted at
+// GT_STOREIND node and if so will mark register requirements for nodes under storeInd so
+// that CodeGen will generate a single instruction of the form:
+//
+// binOp [addressing mode], reg
+//
+// Parameters
+// storeInd - GT_STOREIND node
+//
+// Return value
+// True, if RMW memory op tree pattern is recognized and op counts are set.
+// False otherwise.
+//
+bool Lowering::TreeNodeInfoInitIfRMWMemOp(GenTreePtr storeInd)
+{
+ assert(storeInd->OperGet() == GT_STOREIND);
+
+ // SSE2 doesn't support RMW on float values
+ assert(!varTypeIsFloating(storeInd));
+
+ // Terminology:
+ // indirDst = memory write of an addr mode (i.e. storeind destination)
+ // indirSrc = value being written to memory (i.e. the storeind source, which could be a binary/unary op)
+ // indirCandidate = memory read i.e. a gtInd of an addr mode
+ // indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
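+ //
+ // For example, '*p += x' lowered as storeInd(p, add(ind(p), x)) has ind(p) as the indirCandidate
+ // and x as the indirOpSource.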
+
+ GenTreePtr indirCandidate = nullptr;
+ GenTreePtr indirOpSource = nullptr;
+
+ if (!IsRMWMemOpRootedAtStoreInd(storeInd, &indirCandidate, &indirOpSource))
+ {
+ JITDUMP("Lower of StoreInd didn't mark the node as self contained for reason: %d\n",
+ storeInd->AsStoreInd()->GetRMWStatus());
+ DISPTREERANGE(BlockRange(), storeInd);
+ return false;
+ }
+
+ GenTreePtr indirDst = storeInd->gtGetOp1();
+ GenTreePtr indirSrc = storeInd->gtGetOp2();
+ genTreeOps oper = indirSrc->OperGet();
+
+ // At this point we have successfully detected a RMW memory op of one of the following forms
+ // storeInd(indirDst, indirSrc(indirCandidate, indirOpSource)) OR
+ // storeInd(indirDst, indirSrc(indirOpSource, indirCandidate) in case of commutative operations OR
+ // storeInd(indirDst, indirSrc(indirCandidate) in case of unary operations
+ //
+ // Here indirSrc = one of the supported binary or unary operation for RMW of memory
+ // indirCandidate = a GT_IND node
+ // indirCandidateChild = operand of GT_IND indirCandidate
+ //
+ // The logic below essentially does the following
+ // Make indirOpSource contained.
+ // Make indirSrc contained.
+ // Make indirCandidate contained.
+ // Make indirCandidateChild contained.
+ // Make indirDst contained except when it is a GT_LCL_VAR or GT_CNS_INT that doesn't fit within addr
+ // base.
+ // Note that due to the way containment is supported, we accomplish some of the above by clearing operand counts
+ // and directly propagating them upward.
+ //
+
+ TreeNodeInfo* info = &(storeInd->gtLsraInfo);
+ info->dstCount = 0;
+
+ if (GenTree::OperIsBinary(oper))
+ {
+ // On Xarch RMW operations require that the source memory-op be in a register.
+ assert(!indirOpSource->isMemoryOp() || indirOpSource->gtLsraInfo.dstCount == 1);
+ JITDUMP("Lower successfully detected an assignment of the form: *addrMode BinOp= source\n");
+ info->srcCount = indirOpSource->gtLsraInfo.dstCount;
+ }
+ else
+ {
+ assert(GenTree::OperIsUnary(oper));
+ JITDUMP("Lower successfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n");
+ info->srcCount = 0;
+ }
+ DISPTREERANGE(BlockRange(), storeInd);
+
+ m_lsra->clearOperandCounts(indirSrc);
+ m_lsra->clearOperandCounts(indirCandidate);
+
+ GenTreePtr indirCandidateChild = indirCandidate->gtGetOp1();
+ if (indirCandidateChild->OperGet() == GT_LEA)
+ {
+ GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode();
+
+ if (addrMode->HasBase())
+ {
+ assert(addrMode->Base()->OperIsLeaf());
+ m_lsra->clearOperandCounts(addrMode->Base());
+ info->srcCount++;
+ }
+
+ if (addrMode->HasIndex())
+ {
+ assert(addrMode->Index()->OperIsLeaf());
+ m_lsra->clearOperandCounts(addrMode->Index());
+ info->srcCount++;
+ }
+
+ m_lsra->clearOperandCounts(indirDst);
+ }
+ else
+ {
+ assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR ||
+ indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR || indirCandidateChild->OperGet() == GT_CNS_INT);
+
+ // If it is a GT_LCL_VAR, it still needs the reg to hold the address.
+ // We would still need a reg for GT_CNS_INT if it doesn't fit within addressing mode base.
+ // For GT_CLS_VAR_ADDR, we don't need a reg to hold the address, because the field address is known at jit
+ // time. Similarly, we don't need a reg for GT_LCL_VAR_ADDR, which is a stack address.
+ if (indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR)
+ {
+ m_lsra->clearOperandCounts(indirDst);
+ }
+ else if (indirCandidateChild->IsCnsIntOrI() && indirCandidateChild->AsIntConCommon()->FitsInAddrBase(comp))
+ {
+ m_lsra->clearOperandCounts(indirDst);
+ }
+ else
+ {
+ // Need a reg and hence increment src count of storeind
+ info->srcCount += indirCandidateChild->gtLsraInfo.dstCount;
+ }
+ }
+ m_lsra->clearOperandCounts(indirCandidateChild);
+
+#ifdef _TARGET_X86_
+ if (varTypeIsByte(storeInd))
+ {
+ // If storeInd is of TYP_BYTE, restrict indirOpSource's candidates to byteable registers.
+ bool containedNode = indirOpSource->gtLsraInfo.dstCount == 0;
+ if (!containedNode)
+ {
+ regMaskTP regMask = indirOpSource->gtLsraInfo.getSrcCandidates(m_lsra);
+ assert(regMask != RBM_NONE);
+ indirOpSource->gtLsraInfo.setSrcCandidates(m_lsra, regMask & ~RBM_NON_BYTE_REGS);
+ }
+ }
+#endif
+
+ return true;
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitMul: Set the NodeInfo for a multiply.
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+// None.
+//
+void Lowering::TreeNodeInfoInitMul(GenTreePtr tree)
+{
+#if defined(_TARGET_X86_)
+ assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI || tree->OperGet() == GT_MUL_LONG);
+#else
+ assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI);
+#endif
+ TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+ info->srcCount = 2;
+ info->dstCount = 1;
+
+ GenTreePtr op1 = tree->gtOp.gtOp1;
+ GenTreePtr op2 = tree->gtOp.gtOp2;
+
+ // Case of float/double mul.
+ if (varTypeIsFloating(tree->TypeGet()))
+ {
+ assert(tree->OperGet() == GT_MUL);
+
+ if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
+ {
+ MakeSrcContained(tree, op2);
+ }
+ else if (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1)))
+ {
+ // Since GT_MUL is commutative, we will try to re-order operands if it is safe to
+ // generate a more efficient code sequence for the case of GT_MUL(op1=memOp, op2=non-memOp).
+ MakeSrcContained(tree, op1);
+ }
+ else
+ {
+ // If there are no containable operands, we can make an operand reg optional.
+ SetRegOptionalForBinOp(tree);
+ }
+ return;
+ }
+
+ bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0);
+ bool requiresOverflowCheck = tree->gtOverflowEx();
+ bool useLeaEncoding = false;
+ GenTreePtr memOp = nullptr;
+
+ bool hasImpliedFirstOperand = false;
+ GenTreeIntConCommon* imm = nullptr;
+ GenTreePtr other = nullptr;
+
+ // There are three forms of x86 multiply:
+ // one-op form: RDX:RAX = RAX * r/m
+ // two-op form: reg *= r/m
+ // three-op form: reg = r/m * imm
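+ // (e.g. 'mul r/m32', 'imul r32, r/m32', and 'imul r32, r/m32, imm32' respectively)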
+
+ // This special widening 32x32->64 MUL is not used on x64
+ CLANG_FORMAT_COMMENT_ANCHOR;
+#if defined(_TARGET_X86_)
+ if (tree->OperGet() != GT_MUL_LONG)
+#endif
+ {
+ assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
+ }
+
+ // Multiply should never be using small types
+ assert(!varTypeIsSmall(tree->TypeGet()));
+
+ // We do use the widening multiply to implement
+ // the overflow checking for unsigned multiply
+ //
+ if (isUnsignedMultiply && requiresOverflowCheck)
+ {
+ // The only encoding provided is RDX:RAX = RAX * rm
+ //
+ // Here we set RAX as the only destination candidate
+ // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
+ //
+ info->setDstCandidates(m_lsra, RBM_RAX);
+ hasImpliedFirstOperand = true;
+ }
+ else if (tree->OperGet() == GT_MULHI)
+ {
+ // Have to use the encoding: RDX:RAX = RAX * rm. Since we only care about the
+ // upper 32 bits of the result, set the destination candidate to REG_RDX.
+ info->setDstCandidates(m_lsra, RBM_RDX);
+ hasImpliedFirstOperand = true;
+ }
+#if defined(_TARGET_X86_)
+ else if (tree->OperGet() == GT_MUL_LONG)
+ {
+ // Have to use the encoding: RDX:RAX = RAX * rm.
+ info->setDstCandidates(m_lsra, RBM_RAX);
+ hasImpliedFirstOperand = true;
+ }
+#endif
+ else if (IsContainableImmed(tree, op2) || IsContainableImmed(tree, op1))
+ {
+ if (IsContainableImmed(tree, op2))
+ {
+ imm = op2->AsIntConCommon();
+ other = op1;
+ }
+ else
+ {
+ imm = op1->AsIntConCommon();
+ other = op2;
+ }
+
+ // CQ: We want to rewrite this into a LEA
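+ // (e.g. 'x * 5' can be emitted as 'lea reg, [x + x*4]', avoiding the multiply altogether)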
+ ssize_t immVal = imm->AsIntConCommon()->IconValue();
+ if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9))
+ {
+ useLeaEncoding = true;
+ }
+
+ MakeSrcContained(tree, imm); // The imm is always contained
+ if (other->isMemoryOp())
+ {
+ memOp = other; // memOp may be contained below
+ }
+ }
+
+ // We allow one operand to be a contained memory operand.
+ // The memory op's type must match the 'tree' type.
+ // This is because during codegen we use the 'tree' type to derive EmitTypeSize.
+ // E.g. op1 type = byte, op2 type = byte but GT_MUL tree type is int.
+ //
+ if (memOp == nullptr && op2->isMemoryOp())
+ {
+ memOp = op2;
+ }
+
+ // To generate an LEA we need to force memOp into a register
+ // so we don't allow memOp to be 'contained'.
+ //
+ if (!useLeaEncoding)
+ {
+ if ((memOp != nullptr) && (memOp->TypeGet() == tree->TypeGet()) && IsSafeToContainMem(tree, memOp))
+ {
+ MakeSrcContained(tree, memOp);
+ }
+ else if (imm != nullptr)
+ {
+ // Has a contained immediate operand.
+ // Only 'other' operand can be marked as reg optional.
+ assert(other != nullptr);
+ SetRegOptional(other);
+ }
+ else if (hasImpliedFirstOperand)
+ {
+ // Only op2 can be marked as reg optional.
+ SetRegOptional(op2);
+ }
+ else
+ {
+ // If there are no containable operands, we can make either of op1 or op2
+ // as reg optional.
+ SetRegOptionalForBinOp(tree);
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+ // SetContainsAVXFlags: Set the ContainsAVX flag when the type is a floating point type, and set the
+ // Contains256bitAVX flag when the SIMD vector size is 32 bytes.
+//
+// Arguments:
+// isFloatingPointType - true if it is floating point type
+// sizeOfSIMDVector - SIMD Vector size
+//
+void Lowering::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
+{
+#ifdef FEATURE_AVX_SUPPORT
+ if (isFloatingPointType)
+ {
+ if (comp->getFloatingPointInstructionSet() == InstructionSet_AVX)
+ {
+ comp->getEmitter()->SetContainsAVX(true);
+ }
+ if (sizeOfSIMDVector == 32 && comp->getSIMDInstructionSet() == InstructionSet_AVX)
+ {
+ comp->getEmitter()->SetContains256bitAVX(true);
+ }
+ }
+#endif
+}
+
+#ifdef _TARGET_X86_
+//------------------------------------------------------------------------
+// ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for
+// various reasons
+//
+// Arguments:
+// tree - The node of interest
+//
+// Return Value:
+ // True if we need to exclude non-byteable registers; false otherwise.
+//
+bool Lowering::ExcludeNonByteableRegisters(GenTree* tree)
+{
+ // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'
+ // Storeind itself will not produce any value and hence dstCount=0. But op2 could be a TYP_INT
+ // value. In this case we need to exclude esi/edi from the src candidates of op2.
+ if (varTypeIsByte(tree))
+ {
+ return true;
+ }
+ // Example2: GT_CAST(int <- bool <- int) - here the type of the GT_CAST node is int and castToType is bool.
+ else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType()))
+ {
+ return true;
+ }
+ else if (tree->OperIsCompare())
+ {
+ GenTree* op1 = tree->gtGetOp1();
+ GenTree* op2 = tree->gtGetOp2();
+
+ // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
+ // ubyte as the result of comparison and if the result needs to be materialized into a reg
+ // simply zero extend it to TYP_INT size. Here is an example of generated code:
+ // cmp dl, byte ptr[addr mode]
+ // movzx edx, dl
+ if (varTypeIsByte(op1) && varTypeIsByte(op2))
+ {
+ return true;
+ }
+ // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses
+ // ubyte as the result of the comparison and if the result needs to be materialized into a reg
+ // simply zero extend it to TYP_INT size.
+ else if (varTypeIsByte(op1) && op2->IsCnsIntOrI())
+ {
+ return true;
+ }
+ // Example5: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses
+ // ubyte as the result of the comparison and if the result needs to be materialized into a reg
+ // simply zero extend it to TYP_INT size.
+ else if (op1->IsCnsIntOrI() && varTypeIsByte(op2))
+ {
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+#ifdef FEATURE_SIMD
+ else if (tree->OperGet() == GT_SIMD)
+ {
+ GenTreeSIMD* simdNode = tree->AsSIMD();
+ switch (simdNode->gtSIMDIntrinsicID)
+ {
+ case SIMDIntrinsicOpEquality:
+ case SIMDIntrinsicOpInEquality:
+ // We manifest it into a byte register, so the target must be byteable.
+ return true;
+
+ case SIMDIntrinsicGetItem:
+ {
+ // This logic is duplicated from genSIMDIntrinsicGetItem().
+ // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
+ // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
+ // cases will require this, so the non-byteable registers can be excluded.
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ if (!op1->isMemoryOp() && op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
+ {
+ bool ZeroOrSignExtnReqd = true;
+ unsigned baseSize = genTypeSize(baseType);
+ if (baseSize == 1)
+ {
+ if ((op2->gtIntCon.gtIconVal % 2) == 1)
+ {
+ ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
+ }
+ }
+ else
+ {
+ assert(baseSize == 2);
+ ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
+ }
+ return ZeroOrSignExtnReqd;
+ }
+ break;
+ }
+
+ default:
+ break;
+ }
+ return false;
+ }
+#endif // FEATURE_SIMD
+ else
+ {
+ return false;
+ }
+}
+#endif // _TARGET_X86_
+
+#endif // _TARGET_XARCH_
+
+#endif // !LEGACY_BACKEND