Diffstat (limited to 'src/jit/codegenxarch.cpp')
-rw-r--r--  src/jit/codegenxarch.cpp  9388
1 file changed, 9388 insertions, 0 deletions
diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp
new file mode 100644
index 0000000000..a41c28695b
--- /dev/null
+++ b/src/jit/codegenxarch.cpp
@@ -0,0 +1,9388 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XX XX
+XX Amd64/x86 Code Generator XX
+XX XX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+*/
+#include "jitpch.h"
+#ifdef _MSC_VER
+#pragma hdrstop
+#endif
+
+#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator.
+
+#ifdef _TARGET_XARCH_
+#include "emit.h"
+#include "codegen.h"
+#include "lower.h"
+#include "gcinfo.h"
+#include "gcinfoencoder.h"
+
+// Get the register assigned to the given node
+
+regNumber CodeGenInterface::genGetAssignedReg(GenTreePtr tree)
+{
+ return tree->gtRegNum;
+}
+
+//------------------------------------------------------------------------
+// genSpillVar: Spill a local variable
+//
+// Arguments:
+// tree - the lclVar node for the variable being spilled
+//
+// Return Value:
+// None.
+//
+// Assumptions:
+// The lclVar must be a register candidate (lvRegCandidate)
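+//
+// Notes:
+// When a spill is actually needed, this amounts to a store of the variable's
+// register to its stack home, roughly (illustration only):
+// mov [V##'s frame slot], reg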
+
+void CodeGen::genSpillVar(GenTreePtr tree)
+{
+ unsigned varNum = tree->gtLclVarCommon.gtLclNum;
+ LclVarDsc* varDsc = &(compiler->lvaTable[varNum]);
+
+ assert(varDsc->lvIsRegCandidate());
+
+ // We don't actually need to spill if it is already living in memory
+ bool needsSpill = ((tree->gtFlags & GTF_VAR_DEF) == 0 && varDsc->lvIsInReg());
+ if (needsSpill)
+ {
+ var_types lclTyp = varDsc->TypeGet();
+ if (varDsc->lvNormalizeOnStore())
+ {
+ lclTyp = genActualType(lclTyp);
+ }
+ emitAttr size = emitTypeSize(lclTyp);
+
+ bool restoreRegVar = false;
+ if (tree->gtOper == GT_REG_VAR)
+ {
+ tree->SetOper(GT_LCL_VAR);
+ restoreRegVar = true;
+ }
+
+ // mask off the flag to generate the right spill code, then bring it back
+ tree->gtFlags &= ~GTF_REG_VAL;
+
+ instruction storeIns = ins_Store(tree->TypeGet(), compiler->isSIMDTypeLocalAligned(varNum));
+#if CPU_LONG_USES_REGPAIR
+ if (varTypeIsMultiReg(tree))
+ {
+ assert(varDsc->lvRegNum == genRegPairLo(tree->gtRegPair));
+ assert(varDsc->lvOtherReg == genRegPairHi(tree->gtRegPair));
+ regNumber regLo = genRegPairLo(tree->gtRegPair);
+ regNumber regHi = genRegPairHi(tree->gtRegPair);
+ inst_TT_RV(storeIns, tree, regLo);
+ inst_TT_RV(storeIns, tree, regHi, 4);
+ }
+ else
+#endif
+ {
+ assert(varDsc->lvRegNum == tree->gtRegNum);
+ inst_TT_RV(storeIns, tree, tree->gtRegNum, 0, size);
+ }
+ tree->gtFlags |= GTF_REG_VAL;
+
+ if (restoreRegVar)
+ {
+ tree->SetOper(GT_REG_VAR);
+ }
+
+ genUpdateRegLife(varDsc, /*isBorn*/ false, /*isDying*/ true DEBUGARG(tree));
+ gcInfo.gcMarkRegSetNpt(varDsc->lvRegMask());
+
+ if (VarSetOps::IsMember(compiler, gcInfo.gcTrkStkPtrLcls, varDsc->lvVarIndex))
+ {
+#ifdef DEBUG
+ if (!VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex))
+ {
+ JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming live\n", varNum);
+ }
+ else
+ {
+ JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing live\n", varNum);
+ }
+#endif
+ VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
+ }
+ }
+
+ tree->gtFlags &= ~GTF_SPILL;
+ varDsc->lvRegNum = REG_STK;
+ if (varTypeIsMultiReg(tree))
+ {
+ varDsc->lvOtherReg = REG_STK;
+ }
+}
+
+// inline
+void CodeGenInterface::genUpdateVarReg(LclVarDsc* varDsc, GenTreePtr tree)
+{
+ assert(tree->OperIsScalarLocal() || (tree->gtOper == GT_COPY));
+ varDsc->lvRegNum = tree->gtRegNum;
+}
+
+/*****************************************************************************/
+/*****************************************************************************/
+
+/*****************************************************************************
+ *
+ * Generate code that will set the given register to the integer constant.
+ */
+
+void CodeGen::genSetRegToIcon(regNumber reg, ssize_t val, var_types type, insFlags flags)
+{
+ // Reg cannot be a FP reg
+ assert(!genIsValidFloatReg(reg));
+
+ // The only TYP_REF constant that can come down this path is a managed 'null', since it is not
+ // relocatable. Other ref type constants (e.g. string objects) go through a different
+ // code path.
+ noway_assert(type != TYP_REF || val == 0);
+
+ if (val == 0)
+ {
+ instGen_Set_Reg_To_Zero(emitActualTypeSize(type), reg, flags);
+ }
+ else
+ {
+ // TODO-XArch-CQ: needs all the optimized cases
+ getEmitter()->emitIns_R_I(INS_mov, emitActualTypeSize(type), reg, val);
+ }
+}
+
+/*****************************************************************************
+ *
+ * Generate code to check that the GS cookie wasn't thrashed by a buffer
+ * overrun. If pushReg is true, preserve all registers around the code sequence;
+ * otherwise ECX could be modified.
+ *
+ * Implementation Note: pushReg = true in the case of tail calls.
+ */
+void CodeGen::genEmitGSCookieCheck(bool pushReg)
+{
+ noway_assert(compiler->gsGlobalSecurityCookieAddr || compiler->gsGlobalSecurityCookieVal);
+
+ // Make sure that EAX is reported as live GC-ref so that any GC that kicks in while
+ // executing GS cookie check will not collect the object pointed to by EAX.
+ //
+ // For Amd64 System V, a two-register-returned struct could be returned in RAX and RDX.
+ // In such a case make sure that the correct GC-ness of RDX is reported as well, so that
+ // a GC object pointed to by RDX will not be collected.
+ if (!pushReg)
+ {
+ // Handle multi-reg return type values
+ if (compiler->compMethodReturnsMultiRegRetType())
+ {
+ ReturnTypeDesc retTypeDesc;
+ if (varTypeIsLong(compiler->info.compRetNativeType))
+ {
+ retTypeDesc.InitializeLongReturnType(compiler);
+ }
+ else // we must have a struct return type
+ {
+ retTypeDesc.InitializeStructReturnType(compiler, compiler->info.compMethodInfo->args.retTypeClass);
+ }
+
+ unsigned regCount = retTypeDesc.GetReturnRegCount();
+
+ // Only the x86 and x64 Unix ABIs allow multi-reg returns, and the
+ // number of result regs should be equal to MAX_RET_REG_COUNT.
+ assert(regCount == MAX_RET_REG_COUNT);
+
+ for (unsigned i = 0; i < regCount; ++i)
+ {
+ gcInfo.gcMarkRegPtrVal(retTypeDesc.GetABIReturnReg(i), retTypeDesc.GetReturnRegType(i));
+ }
+ }
+ else if (compiler->compMethodReturnsRetBufAddr())
+ {
+ // This is for returning in an implicit RetBuf.
+ // If the address of the buffer is returned in REG_INTRET, mark the content of INTRET as ByRef.
+
+ // In case the return is in an implicit RetBuf, the native return type should be a struct
+ assert(varTypeIsStruct(compiler->info.compRetNativeType));
+
+ gcInfo.gcMarkRegPtrVal(REG_INTRET, TYP_BYREF);
+ }
+ // ... all other cases.
+ else
+ {
+#ifdef _TARGET_AMD64_
+ // For x64, structs that are not returned in registers are always
+ // returned in implicit RetBuf. If we reached here, we should not have
+ // a RetBuf and the return type should not be a struct.
+ assert(compiler->info.compRetBuffArg == BAD_VAR_NUM);
+ assert(!varTypeIsStruct(compiler->info.compRetNativeType));
+#endif // _TARGET_AMD64_
+
+ // For x86 Windows we can't make such assertions, since we generate code to return
+ // the RetBuf in REG_INTRET only when the ProfilerHook is enabled. Otherwise
+ // compRetNativeType could be TYP_STRUCT.
+ gcInfo.gcMarkRegPtrVal(REG_INTRET, compiler->info.compRetNativeType);
+ }
+ }
+
+ regNumber regGSCheck;
+ if (!pushReg)
+ {
+ // Non-tail call: we can use any callee-trash register that is not
+ // a return register and does not contain the 'this' pointer (which must be kept alive),
+ // since we are generating the GS cookie check after a GT_RETURN block.
+ // Note: On Amd64 System V RDX is an arg register - REG_ARG_2 - as well
+ // as return register for two-register-returned structs.
+ if (compiler->lvaKeepAliveAndReportThis() && compiler->lvaTable[compiler->info.compThisArg].lvRegister &&
+ (compiler->lvaTable[compiler->info.compThisArg].lvRegNum == REG_ARG_0))
+ {
+ regGSCheck = REG_ARG_1;
+ }
+ else
+ {
+ regGSCheck = REG_ARG_0;
+ }
+ }
+ else
+ {
+#ifdef _TARGET_X86_
+ NYI_X86("Tail calls from methods that need GS check");
+ regGSCheck = REG_NA;
+#else // !_TARGET_X86_
+ // Tail calls from methods that need GS check: We need to preserve registers while
+ // emitting GS cookie check for a tail prefixed call or a jmp. To emit GS cookie
+ // check, we might need a register. This won't be an issue for jmp calls for the
+ // reason mentioned below (see comment starting with "Jmp Calls:").
+ //
+ // The following are the possible solutions in case of tail prefixed calls:
+ // 1) Use R11 - ignore the tail prefix on calls that need to pass a param in R11 when
+ // present in methods that require GS cookie check. The rest of the tail calls that
+ // do not require R11 will be honored.
+ // 2) Internal register - GT_CALL node reserves an internal register and emits GS
+ // cookie check as part of tail call codegen. GenExitCode() needs to special case
+ // fast tail calls implemented as epilog+jmp or such tail calls should always get
+ // dispatched via helper.
+ // 3) Materialize the GS cookie check as a separate node hanging off the GT_CALL node in
+ // the right execution order during rationalization.
+ //
+ // There are two calls that use R11: VSD and calli pinvokes with cookie param. Tail
+ // prefix on pinvokes is ignored. That is, options 2 and 3 will allow tail prefixed
+ // VSD calls from methods that need GS check.
+ //
+ // Tail prefixed calls: Right now for Jit64 compat, a method requiring a GS cookie check
+ // ignores the tail prefix. In the future, if we intend to support tail calls from such a method,
+ // consider one of the options mentioned above. For now we add an assert that we don't
+ // expect to see a tail call in a method that requires a GS check.
+ noway_assert(!compiler->compTailCallUsed);
+
+ // Jmp calls: these specify a method handle from which the JIT queries the VM for the entry
+ // point address, and hence a jmp call can neither be a VSD call nor a PInvoke calli with a
+ // cookie parameter. Therefore, in the case of jmp calls it is safe to use R11.
+ regGSCheck = REG_R11;
+#endif // !_TARGET_X86_
+ }
+
+ if (compiler->gsGlobalSecurityCookieAddr == nullptr)
+ {
+ // If the GS cookie value fits within 32 bits, we can use 'cmp mem64, imm32'.
+ // Otherwise, load the value into a reg and use 'cmp mem64, reg64'.
+ if ((int)compiler->gsGlobalSecurityCookieVal != (ssize_t)compiler->gsGlobalSecurityCookieVal)
+ {
+ genSetRegToIcon(regGSCheck, compiler->gsGlobalSecurityCookieVal, TYP_I_IMPL);
+ getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0);
+ }
+ else
+ {
+ getEmitter()->emitIns_S_I(INS_cmp, EA_PTRSIZE, compiler->lvaGSSecurityCookie, 0,
+ (int)compiler->gsGlobalSecurityCookieVal);
+ }
+ }
+ else
+ {
+ // Ngen case - GS cookie value needs to be accessed through an indirection.
+ instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, regGSCheck, (ssize_t)compiler->gsGlobalSecurityCookieAddr);
+ getEmitter()->emitIns_R_AR(ins_Load(TYP_I_IMPL), EA_PTRSIZE, regGSCheck, regGSCheck, 0);
+ getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0);
+ }
+
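+ // As a rough illustration (not the literal emitted bytes), the check generated below is:
+ //      cmp    [lvaGSSecurityCookie], <cookie imm32 or register holding the cookie>
+ //      je     SKIP
+ //      call   CORINFO_HELP_FAIL_FAST
+ //    SKIP:
+ // where SKIP is the temp label defined after the helper call.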
+ BasicBlock* gsCheckBlk = genCreateTempLabel();
+ emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
+ inst_JMP(jmpEqual, gsCheckBlk);
+ genEmitHelperCall(CORINFO_HELP_FAIL_FAST, 0, EA_UNKNOWN);
+ genDefineTempLabel(gsCheckBlk);
+}
+
+/*****************************************************************************
+ *
+ * Generate code for all the basic blocks in the function.
+ */
+
+void CodeGen::genCodeForBBlist()
+{
+ unsigned varNum;
+ LclVarDsc* varDsc;
+
+ unsigned savedStkLvl;
+
+#ifdef DEBUG
+ genInterruptibleUsed = true;
+
+ // You have to be careful if you create basic blocks from now on
+ compiler->fgSafeBasicBlockCreation = false;
+
+ // This stress mode is not compatible with fully interruptible GC
+ if (genInterruptible && compiler->opts.compStackCheckOnCall)
+ {
+ compiler->opts.compStackCheckOnCall = false;
+ }
+
+ // This stress mode is not compatible with fully interruptible GC
+ if (genInterruptible && compiler->opts.compStackCheckOnRet)
+ {
+ compiler->opts.compStackCheckOnRet = false;
+ }
+#endif // DEBUG
+
+ // Prepare the blocks for exception handling codegen: mark the blocks that need labels.
+ genPrepForEHCodegen();
+
+ assert(!compiler->fgFirstBBScratch ||
+ compiler->fgFirstBB == compiler->fgFirstBBScratch); // compiler->fgFirstBBScratch has to be first.
+
+ /* Initialize the spill tracking logic */
+
+ regSet.rsSpillBeg();
+
+#ifdef DEBUGGING_SUPPORT
+ /* Initialize the line# tracking logic */
+
+ if (compiler->opts.compScopeInfo)
+ {
+ siInit();
+ }
+#endif
+
+ // The current implementation of switch tables requires the first block to have a label so it
+ // can generate offsets to the switch label targets.
+ // TODO-XArch-CQ: remove this when switches have been re-implemented to not use this.
+ if (compiler->fgHasSwitch)
+ {
+ compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET;
+ }
+
+ genPendingCallLabel = nullptr;
+
+ /* Initialize the pointer tracking code */
+
+ gcInfo.gcRegPtrSetInit();
+ gcInfo.gcVarPtrSetInit();
+
+ /* If any arguments live in registers, mark those regs as such */
+
+ for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->lvaCount; varNum++, varDsc++)
+ {
+ /* Is this variable a parameter assigned to a register? */
+
+ if (!varDsc->lvIsParam || !varDsc->lvRegister)
+ {
+ continue;
+ }
+
+ /* Is the argument live on entry to the method? */
+
+ if (!VarSetOps::IsMember(compiler, compiler->fgFirstBB->bbLiveIn, varDsc->lvVarIndex))
+ {
+ continue;
+ }
+
+ /* Is this a floating-point argument? */
+
+ if (varDsc->IsFloatRegType())
+ {
+ continue;
+ }
+
+ noway_assert(!varTypeIsFloating(varDsc->TypeGet()));
+
+ /* Mark the register as holding the variable */
+
+ regTracker.rsTrackRegLclVar(varDsc->lvRegNum, varNum);
+ }
+
+ unsigned finallyNesting = 0;
+
+ // Make sure a set is allocated for compiler->compCurLife (in the long case), so we can set it to empty without
+ // allocation at the start of each basic block.
+ VarSetOps::AssignNoCopy(compiler, compiler->compCurLife, VarSetOps::MakeEmpty(compiler));
+
+ /*-------------------------------------------------------------------------
+ *
+ * Walk the basic blocks and generate code for each one
+ *
+ */
+
+ BasicBlock* block;
+ BasicBlock* lblk; /* previous block */
+
+ for (lblk = nullptr, block = compiler->fgFirstBB; block != nullptr; lblk = block, block = block->bbNext)
+ {
+#ifdef DEBUG
+ if (compiler->verbose)
+ {
+ printf("\n=============== Generating ");
+ block->dspBlockHeader(compiler, true, true);
+ compiler->fgDispBBLiveness(block);
+ }
+#endif // DEBUG
+
+ // Figure out which registers hold variables on entry to this block
+
+ regSet.ClearMaskVars();
+ gcInfo.gcRegGCrefSetCur = RBM_NONE;
+ gcInfo.gcRegByrefSetCur = RBM_NONE;
+
+ compiler->m_pLinearScan->recordVarLocationsAtStartOfBB(block);
+
+ genUpdateLife(block->bbLiveIn);
+
+ // Even if liveness didn't change, we need to update the registers containing GC references.
+ // genUpdateLife will update the registers live due to liveness changes. But what about registers that didn't
+ // change? We cleared them out above. Maybe we should just not clear them out, but update the ones that change
+ // here. That would require handling the changes in recordVarLocationsAtStartOfBB().
+
+ regMaskTP newLiveRegSet = RBM_NONE;
+ regMaskTP newRegGCrefSet = RBM_NONE;
+ regMaskTP newRegByrefSet = RBM_NONE;
+#ifdef DEBUG
+ VARSET_TP VARSET_INIT_NOCOPY(removedGCVars, VarSetOps::MakeEmpty(compiler));
+ VARSET_TP VARSET_INIT_NOCOPY(addedGCVars, VarSetOps::MakeEmpty(compiler));
+#endif
+ VARSET_ITER_INIT(compiler, iter, block->bbLiveIn, varIndex);
+ while (iter.NextElem(compiler, &varIndex))
+ {
+ unsigned varNum = compiler->lvaTrackedToVarNum[varIndex];
+ LclVarDsc* varDsc = &(compiler->lvaTable[varNum]);
+
+ if (varDsc->lvIsInReg())
+ {
+ newLiveRegSet |= varDsc->lvRegMask();
+ if (varDsc->lvType == TYP_REF)
+ {
+ newRegGCrefSet |= varDsc->lvRegMask();
+ }
+ else if (varDsc->lvType == TYP_BYREF)
+ {
+ newRegByrefSet |= varDsc->lvRegMask();
+ }
+#ifdef DEBUG
+ if (verbose && VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varIndex))
+ {
+ VarSetOps::AddElemD(compiler, removedGCVars, varIndex);
+ }
+#endif // DEBUG
+ VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varIndex);
+ }
+ else if (compiler->lvaIsGCTracked(varDsc))
+ {
+#ifdef DEBUG
+ if (verbose && !VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varIndex))
+ {
+ VarSetOps::AddElemD(compiler, addedGCVars, varIndex);
+ }
+#endif // DEBUG
+ VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varIndex);
+ }
+ }
+
+ regSet.rsMaskVars = newLiveRegSet;
+
+#ifdef DEBUG
+ if (compiler->verbose)
+ {
+ if (!VarSetOps::IsEmpty(compiler, addedGCVars))
+ {
+ printf("\t\t\t\t\t\t\tAdded GCVars: ");
+ dumpConvertedVarSet(compiler, addedGCVars);
+ printf("\n");
+ }
+ if (!VarSetOps::IsEmpty(compiler, removedGCVars))
+ {
+ printf("\t\t\t\t\t\t\tRemoved GCVars: ");
+ dumpConvertedVarSet(compiler, removedGCVars);
+ printf("\n");
+ }
+ }
+#endif // DEBUG
+
+ gcInfo.gcMarkRegSetGCref(newRegGCrefSet DEBUGARG(true));
+ gcInfo.gcMarkRegSetByref(newRegByrefSet DEBUGARG(true));
+
+ /* Blocks with handlerGetsXcptnObj()==true use GT_CATCH_ARG to
+ represent the exception object (TYP_REF).
+ We mark REG_EXCEPTION_OBJECT as holding a GC object on entry
+ to the block, it will be the first thing evaluated
+ (thanks to GTF_ORDER_SIDEEFF).
+ */
+
+ if (handlerGetsXcptnObj(block->bbCatchTyp))
+ {
+ for (GenTree* node : LIR::AsRange(block))
+ {
+ if (node->OperGet() == GT_CATCH_ARG)
+ {
+ gcInfo.gcMarkRegSetGCref(RBM_EXCEPTION_OBJECT);
+ break;
+ }
+ }
+ }
+
+ /* Start a new code output block */
+
+ genUpdateCurrentFunclet(block);
+
+ if (genAlignLoops && block->bbFlags & BBF_LOOP_HEAD)
+ {
+ getEmitter()->emitLoopAlign();
+ }
+
+#ifdef DEBUG
+ if (compiler->opts.dspCode)
+ {
+ printf("\n L_M%03u_BB%02u:\n", Compiler::s_compMethodsCount, block->bbNum);
+ }
+#endif
+
+ block->bbEmitCookie = nullptr;
+
+ if (block->bbFlags & (BBF_JMP_TARGET | BBF_HAS_LABEL))
+ {
+ /* Mark a label and update the current set of live GC refs */
+
+ block->bbEmitCookie = getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur,
+ gcInfo.gcRegByrefSetCur, FALSE);
+ }
+
+ if (block == compiler->fgFirstColdBlock)
+ {
+#ifdef DEBUG
+ if (compiler->verbose)
+ {
+ printf("\nThis is the start of the cold region of the method\n");
+ }
+#endif
+ // We should never have a block that falls through into the Cold section
+ noway_assert(!lblk->bbFallsThrough());
+
+ // We require the block that starts the Cold section to have a label
+ noway_assert(block->bbEmitCookie);
+ getEmitter()->emitSetFirstColdIGCookie(block->bbEmitCookie);
+ }
+
+ /* Both stacks are always empty on entry to a basic block */
+
+ genStackLevel = 0;
+
+ savedStkLvl = genStackLevel;
+
+ /* Tell everyone which basic block we're working on */
+
+ compiler->compCurBB = block;
+
+#ifdef DEBUGGING_SUPPORT
+ siBeginBlock(block);
+
+ // BBF_INTERNAL blocks don't correspond to any single IL instruction.
+ if (compiler->opts.compDbgInfo && (block->bbFlags & BBF_INTERNAL) &&
+ !compiler->fgBBisScratch(block)) // If the block is the distinguished first scratch block, then no need to
+ // emit a NO_MAPPING entry immediately after the prolog.
+ {
+ genIPmappingAdd((IL_OFFSETX)ICorDebugInfo::NO_MAPPING, true);
+ }
+
+ bool firstMapping = true;
+#endif // DEBUGGING_SUPPORT
+
+ /*---------------------------------------------------------------------
+ *
+ * Generate code for each statement-tree in the block
+ *
+ */
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#if FEATURE_EH_FUNCLETS
+ if (block->bbFlags & BBF_FUNCLET_BEG)
+ {
+ genReserveFuncletProlog(block);
+ }
+#endif // FEATURE_EH_FUNCLETS
+
+ // Clear compCurStmt and compCurLifeTree.
+ compiler->compCurStmt = nullptr;
+ compiler->compCurLifeTree = nullptr;
+
+ // Traverse the block in linear order, generating code for each node as we
+ // encounter it.
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#ifdef DEBUGGING_SUPPORT
+ IL_OFFSETX currentILOffset = BAD_IL_OFFSET;
+#endif
+ for (GenTree* node : LIR::AsRange(block).NonPhiNodes())
+ {
+#ifdef DEBUGGING_SUPPORT
+ // Do we have a new IL offset?
+ if (node->OperGet() == GT_IL_OFFSET)
+ {
+ genEnsureCodeEmitted(currentILOffset);
+ currentILOffset = node->gtStmt.gtStmtILoffsx;
+ genIPmappingAdd(currentILOffset, firstMapping);
+ firstMapping = false;
+ }
+#endif // DEBUGGING_SUPPORT
+
+#ifdef DEBUG
+ if (node->OperGet() == GT_IL_OFFSET)
+ {
+ noway_assert(node->gtStmt.gtStmtLastILoffs <= compiler->info.compILCodeSize ||
+ node->gtStmt.gtStmtLastILoffs == BAD_IL_OFFSET);
+
+ if (compiler->opts.dspCode && compiler->opts.dspInstrs &&
+ node->gtStmt.gtStmtLastILoffs != BAD_IL_OFFSET)
+ {
+ while (genCurDispOffset <= node->gtStmt.gtStmtLastILoffs)
+ {
+ genCurDispOffset += dumpSingleInstr(compiler->info.compCode, genCurDispOffset, "> ");
+ }
+ }
+ }
+#endif // DEBUG
+
+ genCodeForTreeNode(node);
+ if (node->gtHasReg() && node->gtLsraInfo.isLocalDefUse)
+ {
+ genConsumeReg(node);
+ }
+ } // end for each node in block
+
+#ifdef DEBUG
+ // The following set of register spill checks and GC pointer tracking checks used to be
+ // performed at statement boundaries. Now, with LIR, there are no statements, so they are
+ // performed at the end of each block.
+ // TODO: could these checks be performed more frequently? E.g., at each location where
+ // the register allocator says there are no live non-variable registers. Perhaps this could
+ // be done by (a) keeping a running count of live non-variable registers by using
+ // gtLsraInfo.srcCount and gtLsraInfo.dstCount to decrement and increment the count, respectively,
+ // and running the checks when the count is zero. Or, (b) use the map maintained by LSRA
+ // (operandToLocationInfoMap) to mark a node somehow when, after the execution of that node,
+ // there will be no live non-variable registers.
+
+ regSet.rsSpillChk();
+
+ /* Make sure we didn't bungle pointer register tracking */
+
+ regMaskTP ptrRegs = gcInfo.gcRegGCrefSetCur | gcInfo.gcRegByrefSetCur;
+ regMaskTP nonVarPtrRegs = ptrRegs & ~regSet.rsMaskVars;
+
+ // If return is a GC-type, clear it. Note that if a common
+ // epilog is generated (genReturnBB) it has a void return
+ // even though we might return a ref. We can't use the compRetType
+ // as the determiner because something we are tracking as a byref
+ // might be used as a return value of an int function (which is legal)
+ GenTree* blockLastNode = block->lastNode();
+ if ((blockLastNode != nullptr) && (blockLastNode->gtOper == GT_RETURN) &&
+ (varTypeIsGC(compiler->info.compRetType) ||
+ (blockLastNode->gtOp.gtOp1 != nullptr && varTypeIsGC(blockLastNode->gtOp.gtOp1->TypeGet()))))
+ {
+ nonVarPtrRegs &= ~RBM_INTRET;
+ }
+
+ if (nonVarPtrRegs)
+ {
+ printf("Regset after BB%02u gcr=", block->bbNum);
+ printRegMaskInt(gcInfo.gcRegGCrefSetCur & ~regSet.rsMaskVars);
+ compiler->getEmitter()->emitDispRegSet(gcInfo.gcRegGCrefSetCur & ~regSet.rsMaskVars);
+ printf(", byr=");
+ printRegMaskInt(gcInfo.gcRegByrefSetCur & ~regSet.rsMaskVars);
+ compiler->getEmitter()->emitDispRegSet(gcInfo.gcRegByrefSetCur & ~regSet.rsMaskVars);
+ printf(", regVars=");
+ printRegMaskInt(regSet.rsMaskVars);
+ compiler->getEmitter()->emitDispRegSet(regSet.rsMaskVars);
+ printf("\n");
+ }
+
+ noway_assert(nonVarPtrRegs == RBM_NONE);
+#endif // DEBUG
+
+#if defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_AMD64_)
+ if (block->bbNext == nullptr)
+ {
+ // Unit testing of the AMD64 emitter: generate a bunch of instructions into the last block
+ // (it's as good as any, but better than the prolog, which can only be a single instruction
+ // group) then use COMPlus_JitLateDisasm=* to see if the late disassembler
+ // thinks the instructions are the same as we do.
+ genAmd64EmitterUnitTests();
+ }
+#endif // defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_AMD64_)
+
+#ifdef DEBUGGING_SUPPORT
+ // It is possible to reach the end of the block without generating code for the current IL offset.
+ // For example, if the following IR ends the current block, no code will have been generated for
+ // offset 21:
+ //
+ // ( 0, 0) [000040] ------------ il_offset void IL offset: 21
+ //
+ // N001 ( 0, 0) [000039] ------------ nop void
+ //
+ // This can lead to problems when debugging the generated code. To prevent these issues, make sure
+ // we've generated code for the last IL offset we saw in the block.
+ genEnsureCodeEmitted(currentILOffset);
+
+ if (compiler->opts.compScopeInfo && (compiler->info.compVarScopesCount > 0))
+ {
+ siEndBlock(block);
+
+ /* Is this the last block, and are there any open scopes left ? */
+
+ bool isLastBlockProcessed = (block->bbNext == nullptr);
+ if (block->isBBCallAlwaysPair())
+ {
+ isLastBlockProcessed = (block->bbNext->bbNext == nullptr);
+ }
+
+ if (isLastBlockProcessed && siOpenScopeList.scNext)
+ {
+ /* This assert no longer holds, because we may insert a throw
+ block to demarcate the end of a try or finally region when they
+ are at the end of the method. It would be nice if we could fix
+ our code so that this throw block will no longer be necessary. */
+
+ // noway_assert(block->bbCodeOffsEnd != compiler->info.compILCodeSize);
+
+ siCloseAllOpenScopes();
+ }
+ }
+
+#endif // DEBUGGING_SUPPORT
+
+ genStackLevel -= savedStkLvl;
+
+#ifdef DEBUG
+ // compCurLife should be equal to the liveOut set, except that we don't keep
+ // it up to date for vars that are not register candidates
+ // (it would be nice to have an xor set function)
+
+ VARSET_TP VARSET_INIT_NOCOPY(extraLiveVars, VarSetOps::Diff(compiler, block->bbLiveOut, compiler->compCurLife));
+ VarSetOps::UnionD(compiler, extraLiveVars, VarSetOps::Diff(compiler, compiler->compCurLife, block->bbLiveOut));
+ VARSET_ITER_INIT(compiler, extraLiveVarIter, extraLiveVars, extraLiveVarIndex);
+ while (extraLiveVarIter.NextElem(compiler, &extraLiveVarIndex))
+ {
+ unsigned varNum = compiler->lvaTrackedToVarNum[extraLiveVarIndex];
+ LclVarDsc* varDsc = compiler->lvaTable + varNum;
+ assert(!varDsc->lvIsRegCandidate());
+ }
+#endif
+
+ /* Both stacks should always be empty on exit from a basic block */
+ noway_assert(genStackLevel == 0);
+
+#ifdef _TARGET_AMD64_
+ // On AMD64, we need to generate a NOP after a call that is the last instruction of the block, in several
+ // situations, to support proper exception handling semantics. This is mostly to ensure that when the stack
+ // walker computes an instruction pointer for a frame, that instruction pointer is in the correct EH region.
+ // The document "X64 and ARM ABIs.docx" has more details. The situations:
+ // 1. If the call instruction is in a different EH region as the instruction that follows it.
+ // 2. If the call immediately precedes an OS epilog. (Note that what the JIT or VM consider an epilog might
+ // be slightly different from what the OS considers an epilog, and it is the OS-reported epilog that matters
+ // here.)
+ // We handle case #1 here, and case #2 in the emitter.
+ if (getEmitter()->emitIsLastInsCall())
+ {
+ // Ok, the last instruction generated is a call instruction. Do any of the other conditions hold?
+ // Note: we may be generating a few too many NOPs for the case of call preceding an epilog. Technically,
+ // if the next block is a BBJ_RETURN, an epilog will be generated, but there may be some instructions
+ // generated before the OS epilog starts, such as a GS cookie check.
+ if ((block->bbNext == nullptr) || !BasicBlock::sameEHRegion(block, block->bbNext))
+ {
+ // We only need the NOP if we're not going to generate any more code as part of the block end.
+
+ switch (block->bbJumpKind)
+ {
+ case BBJ_ALWAYS:
+ case BBJ_THROW:
+ case BBJ_CALLFINALLY:
+ case BBJ_EHCATCHRET:
+ // We're going to generate more code below anyway, so no need for the NOP.
+
+ case BBJ_RETURN:
+ case BBJ_EHFINALLYRET:
+ case BBJ_EHFILTERRET:
+ // These are the "epilog follows" case, handled in the emitter.
+
+ break;
+
+ case BBJ_NONE:
+ if (block->bbNext == nullptr)
+ {
+ // Call immediately before the end of the code; we should never get here.
+ instGen(INS_BREAKPOINT); // This should never get executed
+ }
+ else
+ {
+ // We need the NOP
+ instGen(INS_nop);
+ }
+ break;
+
+ case BBJ_COND:
+ case BBJ_SWITCH:
+ // These can't have a call as the last instruction!
+
+ default:
+ noway_assert(!"Unexpected bbJumpKind");
+ break;
+ }
+ }
+ }
+#endif // _TARGET_AMD64_
+
+ /* Do we need to generate a jump or return? */
+
+ switch (block->bbJumpKind)
+ {
+ case BBJ_ALWAYS:
+ inst_JMP(EJ_jmp, block->bbJumpDest);
+ break;
+
+ case BBJ_RETURN:
+ genExitCode(block);
+ break;
+
+ case BBJ_THROW:
+ // If we have a throw at the end of a function or funclet, we need to emit another instruction
+ // afterwards to help the OS unwinder determine the correct context during unwind.
+ // We insert an unexecuted breakpoint instruction in several situations
+ // following a throw instruction:
+ // 1. If the throw is the last instruction of the function or funclet. This helps
+ // the OS unwinder determine the correct context during an unwind from the
+ // thrown exception.
+ // 2. If this is the last block of the hot section.
+ // 3. If the subsequent block is a special throw block.
+ // 4. On AMD64, if the next block is in a different EH region.
+ if ((block->bbNext == nullptr) || (block->bbNext->bbFlags & BBF_FUNCLET_BEG) ||
+ !BasicBlock::sameEHRegion(block, block->bbNext) ||
+ (!isFramePointerUsed() && compiler->fgIsThrowHlpBlk(block->bbNext)) ||
+ block->bbNext == compiler->fgFirstColdBlock)
+ {
+ instGen(INS_BREAKPOINT); // This should never get executed
+ }
+
+ break;
+
+ case BBJ_CALLFINALLY:
+
+#if FEATURE_EH_FUNCLETS
+
+ // Generate a call to the finally, like this:
+ // mov rcx,qword ptr [rbp + 20H] // Load rcx with PSPSym
+ // call finally-funclet
+ // jmp finally-return // Only for non-retless finally calls
+ // The jmp can be a NOP if we're going to the next block.
+ // If we're generating code for the main function (not a funclet), and there is no localloc,
+ // then RSP at this point is the same value as that stored in the PSPsym. So just copy RSP
+ // instead of loading the PSPSym in this case.
+
+ if (!compiler->compLocallocUsed && (compiler->funCurrentFunc()->funKind == FUNC_ROOT))
+ {
+ inst_RV_RV(INS_mov, REG_ARG_0, REG_SPBASE, TYP_I_IMPL);
+ }
+ else
+ {
+ getEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_ARG_0, compiler->lvaPSPSym, 0);
+ }
+ getEmitter()->emitIns_J(INS_call, block->bbJumpDest);
+
+ if (block->bbFlags & BBF_RETLESS_CALL)
+ {
+ // We have a retless call, and the last instruction generated was a call.
+ // If the next block is in a different EH region (or is the end of the code
+ // block), then we need to generate a breakpoint here (since it will never
+ // get executed) to get proper unwind behavior.
+
+ if ((block->bbNext == nullptr) || !BasicBlock::sameEHRegion(block, block->bbNext))
+ {
+ instGen(INS_BREAKPOINT); // This should never get executed
+ }
+ }
+ else
+ {
+ // Because of the way the flowgraph is connected, the liveness info for this one instruction
+ // after the call is not (cannot be) correct in cases where a variable has a last use in the
+ // handler. So turn off GC reporting for this single instruction.
+ getEmitter()->emitDisableGC();
+
+ // Now go to where the finally funclet needs to return to.
+ if (block->bbNext->bbJumpDest == block->bbNext->bbNext)
+ {
+ // Fall-through.
+ // TODO-XArch-CQ: Can we get rid of this instruction, and just have the call return directly
+ // to the next instruction? This would depend on stack walking from within the finally
+ // handler working without this instruction being in this special EH region.
+ instGen(INS_nop);
+ }
+ else
+ {
+ inst_JMP(EJ_jmp, block->bbNext->bbJumpDest);
+ }
+
+ getEmitter()->emitEnableGC();
+ }
+
+#else // !FEATURE_EH_FUNCLETS
+
+ // If we are about to invoke a finally locally from a try block, we have to set the ShadowSP slot
+ // corresponding to the finally's nesting level. When invoked in response to an exception, the
+ // EE does this.
+ //
+ // We have a BBJ_CALLFINALLY followed by a BBJ_ALWAYS.
+ //
+ // We will emit :
+ // mov [ebp - (n + 1)], 0
+ // mov [ebp - n ], 0xFC
+ // push &step
+ // jmp finallyBlock
+ // ...
+ // step:
+ // mov [ebp - n ], 0
+ // jmp leaveTarget
+ // ...
+ // leaveTarget:
+
+ noway_assert(isFramePointerUsed());
+
+ // Get the nesting level which contains the finally
+ compiler->fgGetNestingLevel(block, &finallyNesting);
+
+ // The last slot is reserved for ICodeManager::FixContext(ppEndRegion)
+ unsigned filterEndOffsetSlotOffs;
+ filterEndOffsetSlotOffs =
+ (unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE);
+
+ unsigned curNestingSlotOffs;
+ curNestingSlotOffs = (unsigned)(filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE));
+
+ // Zero out the slot for the next nesting level
+ instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, 0, compiler->lvaShadowSPslotsVar,
+ curNestingSlotOffs - TARGET_POINTER_SIZE);
+ instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, LCL_FINALLY_MARK, compiler->lvaShadowSPslotsVar,
+ curNestingSlotOffs);
+
+ // Now push the address where the finally funclet should return to directly.
+ if (!(block->bbFlags & BBF_RETLESS_CALL))
+ {
+ assert(block->isBBCallAlwaysPair());
+ getEmitter()->emitIns_J(INS_push_hide, block->bbNext->bbJumpDest);
+ }
+ else
+ {
+ // The EE expects a DWORD, so we give it 0
+ inst_IV(INS_push_hide, 0);
+ }
+
+ // Jump to the finally BB
+ inst_JMP(EJ_jmp, block->bbJumpDest);
+
+#endif // !FEATURE_EH_FUNCLETS
+
+ // The BBJ_ALWAYS is used because the BBJ_CALLFINALLY can't point to the
+ // jump target using bbJumpDest - that is already used to point
+ // to the finally block. So just skip past the BBJ_ALWAYS unless the
+ // block is RETLESS.
+ if (!(block->bbFlags & BBF_RETLESS_CALL))
+ {
+ assert(block->isBBCallAlwaysPair());
+
+ lblk = block;
+ block = block->bbNext;
+ }
+
+ break;
+
+#if FEATURE_EH_FUNCLETS
+
+ case BBJ_EHCATCHRET:
+ // Set RAX to the address the VM should return to after the catch.
+ // Generate a RIP-relative
+ // lea reg, [rip + disp32] ; the RIP is implicit
+ // which will be position-independent.
+ getEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, block->bbJumpDest, REG_INTRET);
+ __fallthrough;
+
+ case BBJ_EHFINALLYRET:
+ case BBJ_EHFILTERRET:
+ genReserveFuncletEpilog(block);
+ break;
+
+#else // !FEATURE_EH_FUNCLETS
+
+ case BBJ_EHCATCHRET:
+ noway_assert(!"Unexpected BBJ_EHCATCHRET"); // not used on x86
+
+ case BBJ_EHFINALLYRET:
+ case BBJ_EHFILTERRET:
+ {
+ // The last statement of the block must be a GT_RETFILT, which has already been generated.
+ assert(block->lastNode() != nullptr);
+ assert(block->lastNode()->OperGet() == GT_RETFILT);
+
+ if (block->bbJumpKind == BBJ_EHFINALLYRET)
+ {
+ assert(block->lastNode()->gtOp.gtOp1 == nullptr); // op1 == nullptr means endfinally
+
+ // Return using a pop-jmp sequence. As the "try" block calls
+ // the finally with a jmp, this leaves the x86 call-ret stack
+ // balanced in the normal flow path.
+
+ noway_assert(isFramePointerRequired());
+ inst_RV(INS_pop_hide, REG_EAX, TYP_I_IMPL);
+ inst_RV(INS_i_jmp, REG_EAX, TYP_I_IMPL);
+ }
+ else
+ {
+ assert(block->bbJumpKind == BBJ_EHFILTERRET);
+
+ // The return value has already been computed.
+ instGen_Return(0);
+ }
+ }
+ break;
+
+#endif // !FEATURE_EH_FUNCLETS
+
+ case BBJ_NONE:
+ case BBJ_COND:
+ case BBJ_SWITCH:
+ break;
+
+ default:
+ noway_assert(!"Unexpected bbJumpKind");
+ break;
+ }
+
+#ifdef DEBUG
+ compiler->compCurBB = nullptr;
+#endif
+
+ } //------------------ END-FOR each block of the method -------------------
+
+ /* Nothing is live at this point */
+ genUpdateLife(VarSetOps::MakeEmpty(compiler));
+
+ /* Finalize the spill tracking logic */
+
+ regSet.rsSpillEnd();
+
+ /* Finalize the temp tracking logic */
+
+ compiler->tmpEnd();
+
+#ifdef DEBUG
+ if (compiler->verbose)
+ {
+ printf("\n# ");
+ printf("compCycleEstimate = %6d, compSizeEstimate = %5d ", compiler->compCycleEstimate,
+ compiler->compSizeEstimate);
+ printf("%s\n", compiler->info.compFullName);
+ }
+#endif
+}
+
+// Return the child that has the same reg as the dst (if any);
+// the other child is returned via the 'other' out param.
+GenTree* sameRegAsDst(GenTree* tree, GenTree*& other /*out*/)
+{
+ if (tree->gtRegNum == REG_NA)
+ {
+ other = nullptr;
+ return nullptr;
+ }
+
+ GenTreePtr op1 = tree->gtOp.gtOp1;
+ GenTreePtr op2 = tree->gtOp.gtOp2;
+ if (op1->gtRegNum == tree->gtRegNum)
+ {
+ other = op2;
+ return op1;
+ }
+ if (op2->gtRegNum == tree->gtRegNum)
+ {
+ other = op1;
+ return op2;
+ }
+ else
+ {
+ other = nullptr;
+ return nullptr;
+ }
+}
+
+// Move an immediate value into an integer register
+
+void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, regNumber reg, ssize_t imm, insFlags flags)
+{
+ // reg cannot be a FP register
+ assert(!genIsValidFloatReg(reg));
+
+ if (!compiler->opts.compReloc)
+ {
+ size = EA_SIZE(size); // Strip any Reloc flags from size if we aren't doing relocs
+ }
+
+ if ((imm == 0) && !EA_IS_RELOC(size))
+ {
+ instGen_Set_Reg_To_Zero(size, reg, flags);
+ }
+ else
+ {
+ if (genDataIndirAddrCanBeEncodedAsPCRelOffset(imm))
+ {
+ getEmitter()->emitIns_R_AI(INS_lea, EA_PTR_DSP_RELOC, reg, imm);
+ }
+ else
+ {
+ getEmitter()->emitIns_R_I(INS_mov, size, reg, imm);
+ }
+ }
+ regTracker.rsTrackRegIntCns(reg, imm);
+}
+
+/***********************************************************************************
+ *
+ * Generate code to set a register 'targetReg' of type 'targetType' to the constant
+ * value specified by the GT_CNS_INT or GT_CNS_DBL node 'tree'. This does not call
+ * genProduceReg() on the target register.
+ */
+void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTreePtr tree)
+{
+
+ switch (tree->gtOper)
+ {
+ case GT_CNS_INT:
+ {
+ // relocatable values tend to come down as a CNS_INT of native int type
+ // so the line between these two opcodes is kind of blurry
+ GenTreeIntConCommon* con = tree->AsIntConCommon();
+ ssize_t cnsVal = con->IconValue();
+
+ if (con->ImmedValNeedsReloc(compiler))
+ {
+ instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, targetReg, cnsVal);
+ regTracker.rsTrackRegTrash(targetReg);
+ }
+ else
+ {
+ genSetRegToIcon(targetReg, cnsVal, targetType);
+ }
+ }
+ break;
+
+ case GT_CNS_DBL:
+ {
+ double constValue = tree->gtDblCon.gtDconVal;
+
+ // Make sure we use "xorpd reg, reg" only for +ve zero constant (0.0) and not for -ve zero (-0.0)
+ if (*(__int64*)&constValue == 0)
+ {
+ // A faster/smaller way to generate 0
+ instruction ins = genGetInsForOper(GT_XOR, targetType);
+ inst_RV_RV(ins, targetReg, targetReg, targetType);
+ }
+ else
+ {
+ GenTreePtr cns;
+ if (targetType == TYP_FLOAT)
+ {
+ float f = forceCastToFloat(constValue);
+ cns = genMakeConst(&f, targetType, tree, false);
+ }
+ else
+ {
+ cns = genMakeConst(&constValue, targetType, tree, true);
+ }
+
+ inst_RV_TT(ins_Load(targetType), targetReg, cns);
+ }
+ }
+ break;
+
+ default:
+ unreached();
+ }
+}
+
+// Generate code to get the high N bits of a N*N=2N bit multiplication result
+void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
+{
+ assert(!(treeNode->gtFlags & GTF_UNSIGNED));
+ assert(!treeNode->gtOverflowEx());
+
+ regNumber targetReg = treeNode->gtRegNum;
+ var_types targetType = treeNode->TypeGet();
+ emitter* emit = getEmitter();
+ emitAttr size = emitTypeSize(treeNode);
+ GenTree* op1 = treeNode->gtOp.gtOp1;
+ GenTree* op2 = treeNode->gtOp.gtOp2;
+
+ // to get the high bits of the multiply, we are constrained to using the
+ // 1-op form: RDX:RAX = RAX * rm
+ // The 3-op form (Rx=Ry*Rz) does not support it.
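+ //
+ // Roughly, the code below emits (illustration only, assuming LSRA has placed the
+ // register operand so the RAX constraint is met):
+ //      mov    targetReg, regOp   ; if regOp is not already in targetReg
+ //      imul   rmOp               ; RDX:RAX = RAX * rmOp
+ //      mov    targetReg, RDX     ; if targetReg is not RDX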
+
+ genConsumeOperands(treeNode->AsOp());
+
+ GenTree* regOp = op1;
+ GenTree* rmOp = op2;
+
+ // Set rmOp to the contained memory operand (if any)
+ //
+ if (op1->isContained() || (!op2->isContained() && (op2->gtRegNum == targetReg)))
+ {
+ regOp = op2;
+ rmOp = op1;
+ }
+ assert(!regOp->isContained());
+
+ // Setup targetReg when neither of the source operands was a matching register
+ if (regOp->gtRegNum != targetReg)
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, regOp->gtRegNum, targetType);
+ }
+
+ emit->emitInsBinary(INS_imulEAX, size, treeNode, rmOp);
+
+ // Move the result to the desired register, if necessary
+ if (targetReg != REG_RDX)
+ {
+ inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType);
+ }
+}
+
+// generate code for a DIV or MOD operation
+//
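+// For the integer path, the sequence emitted is roughly (illustration only):
+//      mov    rax, dividend      ; if the dividend is not already in RAX
+//      cdq  /  xor edx, edx      ; sign-extend (signed) or zero (unsigned) RDX
+//      idiv / div  divisor
+//      mov    targetReg, rax     ; quotient, for GT_DIV/GT_UDIV (if targetReg != RAX)
+//      mov    targetReg, rdx     ; remainder, for GT_MOD/GT_UMOD (if targetReg != RDX)
+//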
+void CodeGen::genCodeForDivMod(GenTreeOp* treeNode)
+{
+ GenTree* dividend = treeNode->gtOp1;
+ GenTree* divisor = treeNode->gtOp2;
+ genTreeOps oper = treeNode->OperGet();
+ emitAttr size = emitTypeSize(treeNode);
+ regNumber targetReg = treeNode->gtRegNum;
+ var_types targetType = treeNode->TypeGet();
+ emitter* emit = getEmitter();
+
+ // dividend is not contained.
+ assert(!dividend->isContained());
+
+ genConsumeOperands(treeNode->AsOp());
+ if (varTypeIsFloating(targetType))
+ {
+ // divisor is not contained or if contained is a memory op.
+ // Note that a reg-optional operand is treated as a memory op
+ // if no register is allocated to it.
+ assert(!divisor->isContained() || divisor->isMemoryOp() || divisor->IsCnsFltOrDbl() ||
+ divisor->IsRegOptional());
+
+ // Floating point div/rem operation
+ assert(oper == GT_DIV || oper == GT_MOD);
+
+ if (dividend->gtRegNum == targetReg)
+ {
+ emit->emitInsBinary(genGetInsForOper(treeNode->gtOper, targetType), size, treeNode, divisor);
+ }
+ else if (!divisor->isContained() && divisor->gtRegNum == targetReg)
+ {
+ // It is not possible to generate 2-operand divss or divsd where reg2 = reg1 / reg2
+ // because divss/divsd reg1, reg2 will overwrite reg1. Therefore, in the case of AMD64
+ // LSRA has to make sure that such a register assignment is not generated for floating
+ // point div/rem operations.
+ noway_assert(
+ !"GT_DIV/GT_MOD (float): case of reg2 = reg1 / reg2, LSRA should never generate such a reg assignment");
+ }
+ else
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, dividend->gtRegNum, targetType);
+ emit->emitInsBinary(genGetInsForOper(treeNode->gtOper, targetType), size, treeNode, divisor);
+ }
+ }
+ else
+ {
+ // dividend must be in RAX
+ if (dividend->gtRegNum != REG_RAX)
+ {
+ inst_RV_RV(INS_mov, REG_RAX, dividend->gtRegNum, targetType);
+ }
+
+ // zero or sign extend rax to rdx
+ if (oper == GT_UMOD || oper == GT_UDIV)
+ {
+ instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX);
+ }
+ else
+ {
+ emit->emitIns(INS_cdq, size);
+ // the cdq instruction writes RDX, so clear the gcInfo for RDX
+ gcInfo.gcMarkRegSetNpt(RBM_RDX);
+ }
+
+ // Perform the 'targetType' (64-bit or 32-bit) divide instruction
+ instruction ins;
+ if (oper == GT_UMOD || oper == GT_UDIV)
+ {
+ ins = INS_div;
+ }
+ else
+ {
+ ins = INS_idiv;
+ }
+
+ emit->emitInsBinary(ins, size, treeNode, divisor);
+
+ // DIV/IDIV instructions always store the quotient in RAX and the remainder in RDX.
+ // Move the result to the desired register, if necessary
+ if (oper == GT_DIV || oper == GT_UDIV)
+ {
+ if (targetReg != REG_RAX)
+ {
+ inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);
+ }
+ }
+ else
+ {
+ assert((oper == GT_MOD) || (oper == GT_UMOD));
+ if (targetReg != REG_RDX)
+ {
+ inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType);
+ }
+ }
+ }
+ genProduceReg(treeNode);
+}
+
+//------------------------------------------------------------------------
+// genCodeForBinary: Generate code for many binary arithmetic operators
+// This method is expected to have called genConsumeOperands() before calling it.
+//
+// Arguments:
+// treeNode - The binary operation for which we are generating code.
+//
+// Return Value:
+// None.
+//
+// Notes:
+// Mul and div variants have special constraints on x64, so they are not handled here.
+// See the assert below for the operators that are handled.
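+// Typical forms generated (illustration only):
+//      op     reg1, reg2/mem/imm           ; when targetReg is the same as op1's reg
+//      lea    reg3, [reg1 + reg2 or imm]   ; non-overflow GT_ADD with three distinct regs
+//      mov    reg3, reg1
+//      op     reg3, reg2                   ; otherwise
+//      inc/dec reg                         ; GT_ADD of +1/-1 without overflow check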
+
+void CodeGen::genCodeForBinary(GenTree* treeNode)
+{
+ const genTreeOps oper = treeNode->OperGet();
+ regNumber targetReg = treeNode->gtRegNum;
+ var_types targetType = treeNode->TypeGet();
+ emitter* emit = getEmitter();
+
+#if defined(_TARGET_64BIT_)
+ assert(oper == GT_OR || oper == GT_XOR || oper == GT_AND || oper == GT_ADD || oper == GT_SUB);
+#else // !defined(_TARGET_64BIT_)
+ assert(oper == GT_OR || oper == GT_XOR || oper == GT_AND || oper == GT_ADD_LO || oper == GT_ADD_HI ||
+ oper == GT_SUB_LO || oper == GT_SUB_HI || oper == GT_MUL_HI || oper == GT_DIV_HI || oper == GT_MOD_HI ||
+ oper == GT_ADD || oper == GT_SUB);
+#endif // !defined(_TARGET_64BIT_)
+
+ GenTreePtr op1 = treeNode->gtGetOp1();
+ GenTreePtr op2 = treeNode->gtGetOp2();
+
+ // Commutative operations can mark op1 as contained to generate "op reg, memop/immed"
+ if (op1->isContained())
+ {
+ assert(treeNode->OperIsCommutative());
+ assert(op1->isMemoryOp() || op1->IsCnsNonZeroFltOrDbl() || op1->IsIntCnsFitsInI32() || op1->IsRegOptional());
+
+ op1 = treeNode->gtGetOp2();
+ op2 = treeNode->gtGetOp1();
+ }
+
+ instruction ins = genGetInsForOper(treeNode->OperGet(), targetType);
+
+ // The arithmetic node must be sitting in a register (since it's not contained)
+ noway_assert(targetReg != REG_NA);
+
+ regNumber op1reg = op1->isContained() ? REG_NA : op1->gtRegNum;
+ regNumber op2reg = op2->isContained() ? REG_NA : op2->gtRegNum;
+
+ GenTreePtr dst;
+ GenTreePtr src;
+
+ // This is the case of reg1 = reg1 op reg2
+ // We're ready to emit the instruction without any moves
+ if (op1reg == targetReg)
+ {
+ dst = op1;
+ src = op2;
+ }
+ // We have reg1 = reg2 op reg1.
+ // For this to be correct, the operation must be commutative, so that
+ // we can convert it into reg1 = reg1 op reg2 and emit
+ // the same code as above.
+ else if (op2reg == targetReg)
+ {
+ noway_assert(GenTree::OperIsCommutative(oper));
+ dst = op2;
+ src = op1;
+ }
+ // now we know there are 3 different operands so attempt to use LEA
+ else if (oper == GT_ADD && !varTypeIsFloating(treeNode) && !treeNode->gtOverflowEx() // LEA does not set flags
+ && (op2->isContainedIntOrIImmed() || !op2->isContained()))
+ {
+ if (op2->isContainedIntOrIImmed())
+ {
+ emit->emitIns_R_AR(INS_lea, emitTypeSize(treeNode), targetReg, op1reg,
+ (int)op2->AsIntConCommon()->IconValue());
+ }
+ else
+ {
+ assert(op2reg != REG_NA);
+ emit->emitIns_R_ARX(INS_lea, emitTypeSize(treeNode), targetReg, op1reg, op2reg, 1, 0);
+ }
+ genProduceReg(treeNode);
+ return;
+ }
+ // dest, op1 and op2 registers are different:
+ // reg3 = reg1 op reg2
+ // We can implement this by issuing a mov:
+ // reg3 = reg1
+ // reg3 = reg3 op reg2
+ else
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, op1reg, targetType);
+ regTracker.rsTrackRegCopy(targetReg, op1reg);
+ gcInfo.gcMarkRegPtrVal(targetReg, targetType);
+ dst = treeNode;
+ src = op2;
+ }
+
+ // try to use an inc or dec
+ if (oper == GT_ADD && !varTypeIsFloating(treeNode) && src->isContainedIntOrIImmed() && !treeNode->gtOverflowEx())
+ {
+ if (src->IsIntegralConst(1))
+ {
+ emit->emitIns_R(INS_inc, emitTypeSize(treeNode), targetReg);
+ genProduceReg(treeNode);
+ return;
+ }
+ else if (src->IsIntegralConst(-1))
+ {
+ emit->emitIns_R(INS_dec, emitTypeSize(treeNode), targetReg);
+ genProduceReg(treeNode);
+ return;
+ }
+ }
+ regNumber r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src);
+ noway_assert(r == targetReg);
+
+ if (treeNode->gtOverflowEx())
+ {
+#if !defined(_TARGET_64BIT_)
+ assert(oper == GT_ADD || oper == GT_SUB || oper == GT_ADD_HI || oper == GT_SUB_HI);
+#else
+ assert(oper == GT_ADD || oper == GT_SUB);
+#endif
+ genCheckOverflow(treeNode);
+ }
+ genProduceReg(treeNode);
+}
+
+//------------------------------------------------------------------------
+// isStructReturn: Returns whether the 'treeNode' is returning a struct.
+//
+// Arguments:
+// treeNode - The tree node to evaluate whether is a struct return.
+//
+// Return Value:
+// For AMD64 *nix: returns true if 'treeNode' is a GT_RETURN node of type struct.
+// Otherwise returns false.
+// For other platforms always returns false.
+//
+bool CodeGen::isStructReturn(GenTreePtr treeNode)
+{
+ // This method could be called for a 'treeNode' of GT_RETFILT or GT_RETURN.
+ // For GT_RETFILT, the return is always a bool (for a filter)
+ // or void (for the end of a finally block).
+ noway_assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT);
+ if (treeNode->OperGet() != GT_RETURN)
+ {
+ return false;
+ }
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ return varTypeIsStruct(treeNode);
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+ assert(!varTypeIsStruct(treeNode));
+ return false;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+}
+
+//------------------------------------------------------------------------
+// genStructReturn: Generates code for returning a struct.
+//
+// Arguments:
+// treeNode - The GT_RETURN tree node.
+//
+// Return Value:
+// None
+//
+// Assumption:
+// op1 of GT_RETURN node is either GT_LCL_VAR or multi-reg GT_CALL
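+//
+// For example (illustration only), on SysV AMD64 a 16-byte struct may be returned in
+// RAX/RDX, XMM0/XMM1, or a mix of the two, as described by the ReturnTypeDesc; the code
+// below moves the local-var or call result into those ABI return registers.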
+void CodeGen::genStructReturn(GenTreePtr treeNode)
+{
+ assert(treeNode->OperGet() == GT_RETURN);
+ GenTreePtr op1 = treeNode->gtGetOp1();
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (op1->OperGet() == GT_LCL_VAR)
+ {
+ GenTreeLclVarCommon* lclVar = op1->AsLclVarCommon();
+ LclVarDsc* varDsc = &(compiler->lvaTable[lclVar->gtLclNum]);
+ assert(varDsc->lvIsMultiRegRet);
+
+ ReturnTypeDesc retTypeDesc;
+ retTypeDesc.InitializeStructReturnType(compiler, varDsc->lvVerTypeInfo.GetClassHandle());
+ unsigned regCount = retTypeDesc.GetReturnRegCount();
+ assert(regCount == MAX_RET_REG_COUNT);
+
+ if (varTypeIsEnregisterableStruct(op1))
+ {
+ // Right now the only enregisterable structs supported are SIMD vector types.
+ assert(varTypeIsSIMD(op1));
+ assert(!op1->isContained());
+
+ // This is the case where the operand is in a single reg and needs to be
+ // returned in multiple ABI return registers.
+ regNumber opReg = genConsumeReg(op1);
+ regNumber reg0 = retTypeDesc.GetABIReturnReg(0);
+ regNumber reg1 = retTypeDesc.GetABIReturnReg(1);
+
+ if (opReg != reg0 && opReg != reg1)
+ {
+ // Operand reg is different from return regs.
+ // Copy opReg to reg0 and let it be handled by one of the
+ // two cases below.
+ inst_RV_RV(ins_Copy(TYP_DOUBLE), reg0, opReg, TYP_DOUBLE);
+ opReg = reg0;
+ }
+
+ if (opReg == reg0)
+ {
+ assert(opReg != reg1);
+
+ // reg0 - already has required 8-byte in bit position [63:0].
+ // reg1 = opReg.
+ // swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0].
+ inst_RV_RV(ins_Copy(TYP_DOUBLE), reg1, opReg, TYP_DOUBLE);
+ }
+ else
+ {
+ assert(opReg == reg1);
+
+ // reg0 = opReg.
+ // swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0].
+ inst_RV_RV(ins_Copy(TYP_DOUBLE), reg0, opReg, TYP_DOUBLE);
+ }
+ inst_RV_RV_IV(INS_shufpd, EA_16BYTE, reg1, reg1, 0x01);
+ }
+ else
+ {
+ assert(op1->isContained());
+
+ // Copy var on stack into ABI return registers
+ int offset = 0;
+ for (unsigned i = 0; i < regCount; ++i)
+ {
+ var_types type = retTypeDesc.GetReturnRegType(i);
+ regNumber reg = retTypeDesc.GetABIReturnReg(i);
+ getEmitter()->emitIns_R_S(ins_Load(type), emitTypeSize(type), reg, lclVar->gtLclNum, offset);
+ offset += genTypeSize(type);
+ }
+ }
+ }
+ else
+ {
+ assert(op1->IsMultiRegCall() || op1->IsCopyOrReloadOfMultiRegCall());
+
+ genConsumeRegs(op1);
+
+ GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
+ GenTreeCall* call = actualOp1->AsCall();
+ ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+ unsigned regCount = retTypeDesc->GetReturnRegCount();
+ assert(regCount == MAX_RET_REG_COUNT);
+
+ // Handle circular dependency between call allocated regs and ABI return regs.
+ //
+ // It is possible under LSRA stress that originally allocated regs of call node,
+ // say rax and rdx, are spilled and reloaded to rdx and rax respectively. But
+ // GT_RETURN needs to move values as follows: rdx->rax, rax->rdx. A similar
+ // kind of circular dependency could arise between the xmm0 and xmm1 return regs.
+ // Codegen is expected to handle such a circular dependency.
+ //
+ var_types regType0 = retTypeDesc->GetReturnRegType(0);
+ regNumber returnReg0 = retTypeDesc->GetABIReturnReg(0);
+ regNumber allocatedReg0 = call->GetRegNumByIdx(0);
+
+ var_types regType1 = retTypeDesc->GetReturnRegType(1);
+ regNumber returnReg1 = retTypeDesc->GetABIReturnReg(1);
+ regNumber allocatedReg1 = call->GetRegNumByIdx(1);
+
+ if (op1->IsCopyOrReload())
+ {
+ // GT_COPY/GT_RELOAD will have valid reg for those positions
+ // that need to be copied or reloaded.
+ regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0);
+ if (reloadReg != REG_NA)
+ {
+ allocatedReg0 = reloadReg;
+ }
+
+ reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1);
+ if (reloadReg != REG_NA)
+ {
+ allocatedReg1 = reloadReg;
+ }
+ }
+
+ if (allocatedReg0 == returnReg1 && allocatedReg1 == returnReg0)
+ {
+ // Circular dependency - swap allocatedReg0 and allocatedReg1
+ if (varTypeIsFloating(regType0))
+ {
+ assert(varTypeIsFloating(regType1));
+
+ // The fastest way to swap two XMM regs is using PXOR
+ inst_RV_RV(INS_pxor, allocatedReg0, allocatedReg1, TYP_DOUBLE);
+ inst_RV_RV(INS_pxor, allocatedReg1, allocatedReg0, TYP_DOUBLE);
+ inst_RV_RV(INS_pxor, allocatedReg0, allocatedReg1, TYP_DOUBLE);
+ }
+ else
+ {
+ assert(varTypeIsIntegral(regType0));
+ assert(varTypeIsIntegral(regType1));
+ inst_RV_RV(INS_xchg, allocatedReg1, allocatedReg0, TYP_I_IMPL);
+ }
+ }
+ else if (allocatedReg1 == returnReg0)
+ {
+ // Change the order of moves to correctly handle dependency.
+ if (allocatedReg1 != returnReg1)
+ {
+ inst_RV_RV(ins_Copy(regType1), returnReg1, allocatedReg1, regType1);
+ }
+
+ if (allocatedReg0 != returnReg0)
+ {
+ inst_RV_RV(ins_Copy(regType0), returnReg0, allocatedReg0, regType0);
+ }
+ }
+ else
+ {
+ // No circular dependency case.
+ if (allocatedReg0 != returnReg0)
+ {
+ inst_RV_RV(ins_Copy(regType0), returnReg0, allocatedReg0, regType0);
+ }
+
+ if (allocatedReg1 != returnReg1)
+ {
+ inst_RV_RV(ins_Copy(regType1), returnReg1, allocatedReg1, regType1);
+ }
+ }
+ }
+#else
+ unreached();
+#endif
+}
+
+//------------------------------------------------------------------------
+// genReturn: Generates code for return statement.
+// In case of struct return, delegates to the genStructReturn method.
+//
+// Arguments:
+// treeNode - The GT_RETURN or GT_RETFILT tree node.
+//
+// Return Value:
+// None
+//
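+// For a simple non-struct, non-void return this amounts to, roughly (illustration only),
+// moving the computed value into the ABI return register (RAX/EAX or XMM0) if it is not
+// already there; x86 float/double returns additionally go through the x87 stack via 'fld'.
+//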
+void CodeGen::genReturn(GenTreePtr treeNode)
+{
+ assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT);
+ GenTreePtr op1 = treeNode->gtGetOp1();
+ var_types targetType = treeNode->TypeGet();
+
+#ifdef DEBUG
+ if (targetType == TYP_VOID)
+ {
+ assert(op1 == nullptr);
+ }
+#endif
+
+#ifdef _TARGET_X86_
+ if (treeNode->TypeGet() == TYP_LONG)
+ {
+ assert(op1 != nullptr);
+ noway_assert(op1->OperGet() == GT_LONG);
+ GenTree* loRetVal = op1->gtGetOp1();
+ GenTree* hiRetVal = op1->gtGetOp2();
+ noway_assert((loRetVal->gtRegNum != REG_NA) && (hiRetVal->gtRegNum != REG_NA));
+
+ genConsumeReg(loRetVal);
+ genConsumeReg(hiRetVal);
+ if (loRetVal->gtRegNum != REG_LNGRET_LO)
+ {
+ inst_RV_RV(ins_Copy(targetType), REG_LNGRET_LO, loRetVal->gtRegNum, TYP_INT);
+ }
+ if (hiRetVal->gtRegNum != REG_LNGRET_HI)
+ {
+ inst_RV_RV(ins_Copy(targetType), REG_LNGRET_HI, hiRetVal->gtRegNum, TYP_INT);
+ }
+ }
+ else
+#endif // !defined(_TARGET_X86_)
+ {
+ if (isStructReturn(treeNode))
+ {
+ genStructReturn(treeNode);
+ }
+ else if (targetType != TYP_VOID)
+ {
+ assert(op1 != nullptr);
+ noway_assert(op1->gtRegNum != REG_NA);
+
+ // !! NOTE !! genConsumeReg will clear op1 as GC ref after it has
+ // consumed a reg for the operand. This is because the variable
+ // is dead after return. But we are issuing more instructions
+ // like "profiler leave callback" after this consumption. So
+ // if you are issuing more instructions after this point,
+ // remember to keep the variable live up until the new method
+ // exit point where it is actually dead.
+ genConsumeReg(op1);
+
+ regNumber retReg = varTypeIsFloating(treeNode) ? REG_FLOATRET : REG_INTRET;
+#ifdef _TARGET_X86_
+ if (varTypeIsFloating(treeNode))
+ {
+ // Spill the return value register from an XMM register to the stack, then load it on the x87 stack.
+ // If it already has a home location, use that. Otherwise, we need a temp.
+ if (genIsRegCandidateLocal(op1) && compiler->lvaTable[op1->gtLclVarCommon.gtLclNum].lvOnFrame)
+ {
+ // Store local variable to its home location, if necessary.
+ if ((op1->gtFlags & GTF_REG_VAL) != 0)
+ {
+ op1->gtFlags &= ~GTF_REG_VAL;
+ inst_TT_RV(ins_Store(op1->gtType,
+ compiler->isSIMDTypeLocalAligned(op1->gtLclVarCommon.gtLclNum)),
+ op1, op1->gtRegNum);
+ }
+ // Now, load it to the fp stack.
+ getEmitter()->emitIns_S(INS_fld, emitTypeSize(op1), op1->AsLclVarCommon()->gtLclNum, 0);
+ }
+ else
+ {
+ // Spill the value, which should be in a register, then load it to the fp stack.
+ // TODO-X86-CQ: Deal with things that are already in memory (don't call genConsumeReg yet).
+ op1->gtFlags |= GTF_SPILL;
+ regSet.rsSpillTree(op1->gtRegNum, op1);
+ op1->gtFlags |= GTF_SPILLED;
+ op1->gtFlags &= ~GTF_SPILL;
+
+ TempDsc* t = regSet.rsUnspillInPlace(op1, op1->gtRegNum);
+ inst_FS_ST(INS_fld, emitActualTypeSize(op1->gtType), t, 0);
+ op1->gtFlags &= ~GTF_SPILLED;
+ compiler->tmpRlsTemp(t);
+ }
+ }
+ else
+#endif // _TARGET_X86_
+ {
+ if (op1->gtRegNum != retReg)
+ {
+ inst_RV_RV(ins_Copy(targetType), retReg, op1->gtRegNum, targetType);
+ }
+ }
+ }
+ }
+
+#ifdef PROFILING_SUPPORTED
+ // !! Note !!
+    // TODO-AMD64-Unix: If the profiler hook is implemented on *nix, make sure that for structs
+    // returned in two registers, RAX and RDX are kept alive. Make the necessary changes in
+    // lowerxarch.cpp in the handling of the GT_RETURN statement.
+ // Such structs containing GC pointers need to be handled by calling gcInfo.gcMarkRegSetNpt
+ // for the return registers containing GC refs.
+
+ // There will be a single return block while generating profiler ELT callbacks.
+ //
+ // Reason for not materializing Leave callback as a GT_PROF_HOOK node after GT_RETURN:
+    // The flowgraph and other places assert that the last node of a block marked
+    // BBJ_RETURN is either a GT_RETURN, a GT_JMP, or a tail call. It would be nice to
+    // maintain such an invariant irrespective of whether a profiler hook is needed or not.
+ // Also, there is not much to be gained by materializing it as an explicit node.
+ if (compiler->compCurBB == compiler->genReturnBB)
+ {
+ // !! NOTE !!
+ // Since we are invalidating the assumption that we would slip into the epilog
+ // right after the "return", we need to preserve the return reg's GC state
+ // across the call until actual method return.
+ if (varTypeIsGC(compiler->info.compRetType))
+ {
+ gcInfo.gcMarkRegPtrVal(REG_INTRET, compiler->info.compRetType);
+ }
+
+ genProfilingLeaveCallback();
+
+ if (varTypeIsGC(compiler->info.compRetType))
+ {
+            gcInfo.gcMarkRegSetNpt(genRegMask(REG_INTRET));
+ }
+ }
+#endif
+}
+
+/*****************************************************************************
+ *
+ * Generate code for a single node in the tree.
+ * Preconditions: All operands have been evaluated
+ *
+ */
+void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
+{
+ regNumber targetReg;
+#if !defined(_TARGET_64BIT_)
+ if (treeNode->TypeGet() == TYP_LONG)
+ {
+ // All long enregistered nodes will have been decomposed into their
+ // constituent lo and hi nodes.
+ targetReg = REG_NA;
+ }
+ else
+#endif // !defined(_TARGET_64BIT_)
+ {
+ targetReg = treeNode->gtRegNum;
+ }
+ var_types targetType = treeNode->TypeGet();
+ emitter* emit = getEmitter();
+
+#ifdef DEBUG
+ // Validate that all the operands for the current node are consumed in order.
+ // This is important because LSRA ensures that any necessary copies will be
+ // handled correctly.
+ lastConsumedNode = nullptr;
+ if (compiler->verbose)
+ {
+ unsigned seqNum = treeNode->gtSeqNum; // Useful for setting a conditional break in Visual Studio
+ printf("Generating: ");
+ compiler->gtDispTree(treeNode, nullptr, nullptr, true);
+ }
+#endif // DEBUG
+
+ // Is this a node whose value is already in a register? LSRA denotes this by
+ // setting the GTF_REUSE_REG_VAL flag.
+ if (treeNode->IsReuseRegVal())
+ {
+ // For now, this is only used for constant nodes.
+ assert((treeNode->OperIsConst()));
+ JITDUMP(" TreeNode is marked ReuseReg\n");
+ return;
+ }
+
+ // contained nodes are part of their parents for codegen purposes
+ // ex : immediates, most LEAs
+ if (treeNode->isContained())
+ {
+ return;
+ }
+
+ switch (treeNode->gtOper)
+ {
+ case GT_START_NONGC:
+ getEmitter()->emitDisableGC();
+ break;
+
+ case GT_PROF_HOOK:
+#ifdef PROFILING_SUPPORTED
+ // We should be seeing this only if profiler hook is needed
+ noway_assert(compiler->compIsProfilerHookNeeded());
+
+ // Right now this node is used only for tail calls. In future if
+ // we intend to use it for Enter or Leave hooks, add a data member
+ // to this node indicating the kind of profiler hook. For example,
+ // helper number can be used.
+ genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL);
+#endif // PROFILING_SUPPORTED
+ break;
+
+ case GT_LCLHEAP:
+ genLclHeap(treeNode);
+ break;
+
+ case GT_CNS_INT:
+#ifdef _TARGET_X86_
+ NYI_IF(treeNode->IsIconHandle(GTF_ICON_TLS_HDL), "TLS constants");
+#endif // _TARGET_X86_
+ __fallthrough;
+
+ case GT_CNS_DBL:
+ genSetRegToConst(targetReg, targetType, treeNode);
+ genProduceReg(treeNode);
+ break;
+
+ case GT_NEG:
+ case GT_NOT:
+ if (varTypeIsFloating(targetType))
+ {
+ assert(treeNode->gtOper == GT_NEG);
+ genSSE2BitwiseOp(treeNode);
+ }
+ else
+ {
+ GenTreePtr operand = treeNode->gtGetOp1();
+ assert(!operand->isContained());
+ regNumber operandReg = genConsumeReg(operand);
+
+ if (operandReg != targetReg)
+ {
+ inst_RV_RV(INS_mov, targetReg, operandReg, targetType);
+ }
+
+ instruction ins = genGetInsForOper(treeNode->OperGet(), targetType);
+ inst_RV(ins, targetReg, targetType);
+ }
+ genProduceReg(treeNode);
+ break;
+
+ case GT_OR:
+ case GT_XOR:
+ case GT_AND:
+ assert(varTypeIsIntegralOrI(treeNode));
+ __fallthrough;
+
+#if !defined(_TARGET_64BIT_)
+ case GT_ADD_LO:
+ case GT_ADD_HI:
+ case GT_SUB_LO:
+ case GT_SUB_HI:
+#endif // !defined(_TARGET_64BIT_)
+ case GT_ADD:
+ case GT_SUB:
+ genConsumeOperands(treeNode->AsOp());
+ genCodeForBinary(treeNode);
+ break;
+
+ case GT_LSH:
+ case GT_RSH:
+ case GT_RSZ:
+ case GT_ROL:
+ case GT_ROR:
+ genCodeForShift(treeNode);
+ // genCodeForShift() calls genProduceReg()
+ break;
+
+ case GT_CAST:
+#if !defined(_TARGET_64BIT_)
+            // We will NYI in DecomposeNode() if we are casting TO a long type, but we do not
+ // yet support casting FROM a long type either, and that's simpler to catch
+ // here.
+ NYI_IF(varTypeIsLong(treeNode->gtOp.gtOp1), "Casts from TYP_LONG");
+#endif // !defined(_TARGET_64BIT_)
+
+ if (varTypeIsFloating(targetType) && varTypeIsFloating(treeNode->gtOp.gtOp1))
+ {
+ // Casts float/double <--> double/float
+ genFloatToFloatCast(treeNode);
+ }
+ else if (varTypeIsFloating(treeNode->gtOp.gtOp1))
+ {
+ // Casts float/double --> int32/int64
+ genFloatToIntCast(treeNode);
+ }
+ else if (varTypeIsFloating(targetType))
+ {
+ // Casts int32/uint32/int64/uint64 --> float/double
+ genIntToFloatCast(treeNode);
+ }
+ else
+ {
+ // Casts int <--> int
+ genIntToIntCast(treeNode);
+ }
+ // The per-case functions call genProduceReg()
+ break;
+
+ case GT_LCL_VAR:
+ {
+ // lcl_vars are not defs
+ assert((treeNode->gtFlags & GTF_VAR_DEF) == 0);
+
+ GenTreeLclVarCommon* lcl = treeNode->AsLclVarCommon();
+ bool isRegCandidate = compiler->lvaTable[lcl->gtLclNum].lvIsRegCandidate();
+
+ if (isRegCandidate && !(treeNode->gtFlags & GTF_VAR_DEATH))
+ {
+ assert((treeNode->InReg()) || (treeNode->gtFlags & GTF_SPILLED));
+ }
+
+ // If this is a register candidate that has been spilled, genConsumeReg() will
+ // reload it at the point of use. Otherwise, if it's not in a register, we load it here.
+
+ if (!treeNode->InReg() && !(treeNode->gtFlags & GTF_SPILLED))
+ {
+ assert(!isRegCandidate);
+
+ emit->emitIns_R_S(ins_Load(treeNode->TypeGet(), compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)),
+ emitTypeSize(treeNode), treeNode->gtRegNum, lcl->gtLclNum, 0);
+ genProduceReg(treeNode);
+ }
+ }
+ break;
+
+ case GT_LCL_FLD_ADDR:
+ case GT_LCL_VAR_ADDR:
+ // Address of a local var. This by itself should never be allocated a register.
+ // If it is worth storing the address in a register then it should be cse'ed into
+ // a temp and that would be allocated a register.
+ noway_assert(targetType == TYP_BYREF);
+ noway_assert(!treeNode->InReg());
+
+ inst_RV_TT(INS_lea, targetReg, treeNode, 0, EA_BYREF);
+ genProduceReg(treeNode);
+ break;
+
+ case GT_LCL_FLD:
+ {
+ noway_assert(targetType != TYP_STRUCT);
+ noway_assert(treeNode->gtRegNum != REG_NA);
+
+#ifdef FEATURE_SIMD
+ // Loading of TYP_SIMD12 (i.e. Vector3) field
+ if (treeNode->TypeGet() == TYP_SIMD12)
+ {
+ genLoadLclFldTypeSIMD12(treeNode);
+ break;
+ }
+#endif
+
+ emitAttr size = emitTypeSize(targetType);
+ unsigned offs = treeNode->gtLclFld.gtLclOffs;
+ unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
+ assert(varNum < compiler->lvaCount);
+
+ emit->emitIns_R_S(ins_Move_Extend(targetType, treeNode->InReg()), size, targetReg, varNum, offs);
+ }
+ genProduceReg(treeNode);
+ break;
+
+ case GT_STORE_LCL_FLD:
+ {
+ noway_assert(targetType != TYP_STRUCT);
+ noway_assert(!treeNode->InReg());
+ assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
+
+#ifdef FEATURE_SIMD
+ // storing of TYP_SIMD12 (i.e. Vector3) field
+ if (treeNode->TypeGet() == TYP_SIMD12)
+ {
+ genStoreLclFldTypeSIMD12(treeNode);
+ break;
+ }
+#endif
+ GenTreePtr op1 = treeNode->gtGetOp1();
+ genConsumeRegs(op1);
+ emit->emitInsBinary(ins_Store(targetType), emitTypeSize(treeNode), treeNode, op1);
+ }
+ break;
+
+ case GT_STORE_LCL_VAR:
+ {
+ GenTreePtr op1 = treeNode->gtGetOp1();
+
+            // The 'var = call' case, where the call returns a multi-reg return value,
+            // is handled separately.
+ if (op1->gtSkipReloadOrCopy()->IsMultiRegCall())
+ {
+ genMultiRegCallStoreToLocal(treeNode);
+ }
+ else
+ {
+ noway_assert(targetType != TYP_STRUCT);
+ assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
+
+ unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum;
+ LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
+
+ // Ensure that lclVar nodes are typed correctly.
+ assert(!varDsc->lvNormalizeOnStore() || treeNode->TypeGet() == genActualType(varDsc->TypeGet()));
+
+#if !defined(_TARGET_64BIT_)
+ if (treeNode->TypeGet() == TYP_LONG)
+ {
+ genStoreLongLclVar(treeNode);
+ break;
+ }
+#endif // !defined(_TARGET_64BIT_)
+
+#ifdef FEATURE_SIMD
+ if (varTypeIsSIMD(targetType) && (targetReg != REG_NA) && op1->IsCnsIntOrI())
+ {
+ // This is only possible for a zero-init.
+ noway_assert(op1->IsIntegralConst(0));
+ genSIMDZero(targetType, varDsc->lvBaseType, targetReg);
+ genProduceReg(treeNode);
+ break;
+ }
+#endif // FEATURE_SIMD
+
+ genConsumeRegs(op1);
+
+ if (treeNode->gtRegNum == REG_NA)
+ {
+ // stack store
+ emit->emitInsMov(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)),
+ emitTypeSize(targetType), treeNode);
+ varDsc->lvRegNum = REG_STK;
+ }
+ else
+ {
+ bool containedOp1 = op1->isContained();
+ // Look for the case where we have a constant zero which we've marked for reuse,
+ // but which isn't actually in the register we want. In that case, it's better to create
+ // zero in the target register, because an xor is smaller than a copy. Note that we could
+ // potentially handle this in the register allocator, but we can't always catch it there
+ // because the target may not have a register allocated for it yet.
+ if (!containedOp1 && (op1->gtRegNum != treeNode->gtRegNum) &&
+ (op1->IsIntegralConst(0) || op1->IsFPZero()))
+ {
+ op1->gtRegNum = REG_NA;
+ op1->ResetReuseRegVal();
+ containedOp1 = true;
+ }
+
+ if (containedOp1)
+ {
+ // Currently, we assume that the contained source of a GT_STORE_LCL_VAR writing to a register
+ // must be a constant. However, in the future we might want to support a contained memory op.
+ // This is a bit tricky because we have to decide it's contained before register allocation,
+ // and this would be a case where, once that's done, we need to mark that node as always
+ // requiring a register - which we always assume now anyway, but once we "optimize" that
+ // we'll have to take cases like this into account.
+ assert((op1->gtRegNum == REG_NA) && op1->OperIsConst());
+ genSetRegToConst(treeNode->gtRegNum, targetType, op1);
+ }
+ else if (op1->gtRegNum != treeNode->gtRegNum)
+ {
+ assert(op1->gtRegNum != REG_NA);
+ emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(treeNode), treeNode, op1);
+ }
+ }
+ }
+
+ if (treeNode->gtRegNum != REG_NA)
+ {
+ genProduceReg(treeNode);
+ }
+ }
+ break;
+
+ case GT_RETFILT:
+ // A void GT_RETFILT is the end of a finally. For non-void filter returns we need to load the result in
+ // the return register, if it's not already there. The processing is the same as GT_RETURN.
+ if (targetType != TYP_VOID)
+ {
+ // For filters, the IL spec says the result is type int32. Further, the only specified legal values
+ // are 0 or 1, with the use of other values "undefined".
+ assert(targetType == TYP_INT);
+ }
+
+ __fallthrough;
+
+ case GT_RETURN:
+ genReturn(treeNode);
+ break;
+
+ case GT_LEA:
+ {
+ // if we are here, it is the case where there is an LEA that cannot
+ // be folded into a parent instruction
+ GenTreeAddrMode* lea = treeNode->AsAddrMode();
+ genLeaInstruction(lea);
+ }
+ // genLeaInstruction calls genProduceReg()
+ break;
+
+ case GT_IND:
+#ifdef FEATURE_SIMD
+ // Handling of Vector3 type values loaded through indirection.
+ if (treeNode->TypeGet() == TYP_SIMD12)
+ {
+ genLoadIndTypeSIMD12(treeNode);
+ break;
+ }
+#endif // FEATURE_SIMD
+
+ genConsumeAddress(treeNode->AsIndir()->Addr());
+ emit->emitInsMov(ins_Load(treeNode->TypeGet()), emitTypeSize(treeNode), treeNode);
+ genProduceReg(treeNode);
+ break;
+
+ case GT_MULHI:
+ genCodeForMulHi(treeNode->AsOp());
+ genProduceReg(treeNode);
+ break;
+
+ case GT_MUL:
+ {
+ instruction ins;
+ emitAttr size = emitTypeSize(treeNode);
+ bool isUnsignedMultiply = ((treeNode->gtFlags & GTF_UNSIGNED) != 0);
+ bool requiresOverflowCheck = treeNode->gtOverflowEx();
+
+ GenTree* op1 = treeNode->gtGetOp1();
+ GenTree* op2 = treeNode->gtGetOp2();
+
+ // there are 3 forms of x64 multiply:
+            // 1-op form with 128-bit result: RDX:RAX = RAX * rm
+ // 2-op form: reg *= rm
+ // 3-op form: reg = rm * imm
+
+ genConsumeOperands(treeNode->AsOp());
+
+ // This matches the 'mul' lowering in Lowering::SetMulOpCounts()
+ //
+ // immOp :: Only one operand can be an immediate
+ // rmOp :: Only one operand can be a memory op.
+ // regOp :: A register op (especially the operand that matches 'targetReg')
+ // (can be nullptr when we have both a memory op and an immediate op)
+
+ GenTree* immOp = nullptr;
+ GenTree* rmOp = op1;
+ GenTree* regOp;
+
+ if (op2->isContainedIntOrIImmed())
+ {
+ immOp = op2;
+ }
+ else if (op1->isContainedIntOrIImmed())
+ {
+ immOp = op1;
+ rmOp = op2;
+ }
+
+ if (immOp != nullptr)
+ {
+ // This must be a non-floating point operation.
+ assert(!varTypeIsFloating(treeNode));
+
+ // CQ: When possible use LEA for mul by imm 3, 5 or 9
+ ssize_t imm = immOp->AsIntConCommon()->IconValue();
+
+ if (!requiresOverflowCheck && !rmOp->isContained() && ((imm == 3) || (imm == 5) || (imm == 9)))
+ {
+ // We will use the LEA instruction to perform this multiply
+ // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9.
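+                    // For example, imm == 5 emits roughly: lea targetReg, [rmOp + rmOp*4]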
+ unsigned int scale = (unsigned int)(imm - 1);
+ getEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->gtRegNum, rmOp->gtRegNum, scale, 0);
+ }
+ else
+ {
+ // use the 3-op form with immediate
+ ins = getEmitter()->inst3opImulForReg(targetReg);
+ emit->emitInsBinary(ins, size, rmOp, immOp);
+ }
+ }
+ else // we have no contained immediate operand
+ {
+ regOp = op1;
+ rmOp = op2;
+
+ regNumber mulTargetReg = targetReg;
+ if (isUnsignedMultiply && requiresOverflowCheck)
+ {
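+                    // The widening 'mul rm' form implicitly multiplies RAX by rm and writes the
+                    // double-width result to RDX:RAX (EDX:EAX for 32-bit operands); CF/OF are set
+                    // when the high half is non-zero, which the overflow check below relies on.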
+ ins = INS_mulEAX;
+ mulTargetReg = REG_RAX;
+ }
+ else
+ {
+ ins = genGetInsForOper(GT_MUL, targetType);
+ }
+
+                // Set rmOp to the contained memory operand (if any)
+                // or set regOp to op2 when it has the matching target register for our multiply op
+ //
+ if (op1->isContained() || (!op2->isContained() && (op2->gtRegNum == mulTargetReg)))
+ {
+ regOp = op2;
+ rmOp = op1;
+ }
+ assert(!regOp->isContained());
+
+ // Setup targetReg when neither of the source operands was a matching register
+ if (regOp->gtRegNum != mulTargetReg)
+ {
+ inst_RV_RV(ins_Copy(targetType), mulTargetReg, regOp->gtRegNum, targetType);
+ }
+
+ emit->emitInsBinary(ins, size, treeNode, rmOp);
+
+ // Move the result to the desired register, if necessary
+ if ((ins == INS_mulEAX) && (targetReg != REG_RAX))
+ {
+ inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);
+ }
+ }
+
+ if (requiresOverflowCheck)
+ {
+ // Overflow checking is only used for non-floating point types
+ noway_assert(!varTypeIsFloating(treeNode));
+
+ genCheckOverflow(treeNode);
+ }
+ }
+ genProduceReg(treeNode);
+ break;
+
+ case GT_MOD:
+ case GT_UDIV:
+ case GT_UMOD:
+ // We shouldn't be seeing GT_MOD on float/double args as it should get morphed into a
+ // helper call by front-end. Similarly we shouldn't be seeing GT_UDIV and GT_UMOD
+ // on float/double args.
+ noway_assert(!varTypeIsFloating(treeNode));
+ __fallthrough;
+
+ case GT_DIV:
+ genCodeForDivMod(treeNode->AsOp());
+ break;
+
+ case GT_INTRINSIC:
+ genIntrinsic(treeNode);
+ break;
+
+#ifdef FEATURE_SIMD
+ case GT_SIMD:
+ genSIMDIntrinsic(treeNode->AsSIMD());
+ break;
+#endif // FEATURE_SIMD
+
+ case GT_CKFINITE:
+ genCkfinite(treeNode);
+ break;
+
+ case GT_EQ:
+ case GT_NE:
+ case GT_LT:
+ case GT_LE:
+ case GT_GE:
+ case GT_GT:
+ {
+ // TODO-XArch-CQ: Check if we can use the currently set flags.
+ // TODO-XArch-CQ: Check for the case where we can simply transfer the carry bit to a register
+ // (signed < or >= where targetReg != REG_NA)
+
+ GenTreePtr op1 = treeNode->gtGetOp1();
+ var_types op1Type = op1->TypeGet();
+
+ if (varTypeIsFloating(op1Type))
+ {
+ genCompareFloat(treeNode);
+ }
+#if !defined(_TARGET_64BIT_)
+ // X86 Long comparison
+ else if (varTypeIsLong(op1Type))
+ {
+ // When not materializing the result in a register, the compare logic is generated
+ // when we generate the GT_JTRUE.
+ if (treeNode->gtRegNum != REG_NA)
+ {
+ genCompareLong(treeNode);
+ }
+ else
+ {
+ // We generate the compare when we generate the GT_JTRUE, but we need to consume
+ // the operands now.
+ genConsumeOperands(treeNode->AsOp());
+ }
+ }
+#endif // !defined(_TARGET_64BIT_)
+ else
+ {
+ genCompareInt(treeNode);
+ }
+ }
+ break;
+
+ case GT_JTRUE:
+ {
+ GenTree* cmp = treeNode->gtOp.gtOp1;
+
+ assert(cmp->OperIsCompare());
+ assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
+
+#if !defined(_TARGET_64BIT_)
+ // For long compares, we emit special logic
+ if (varTypeIsLong(cmp->gtGetOp1()))
+ {
+ genJTrueLong(cmp);
+ }
+ else
+#endif
+ {
+ // Get the "kind" and type of the comparison. Note that whether it is an unsigned cmp
+ // is governed by a flag NOT by the inherent type of the node
+ // TODO-XArch-CQ: Check if we can use the currently set flags.
+ emitJumpKind jumpKind[2];
+ bool branchToTrueLabel[2];
+ genJumpKindsForTree(cmp, jumpKind, branchToTrueLabel);
+
+ BasicBlock* skipLabel = nullptr;
+ if (jumpKind[0] != EJ_NONE)
+ {
+ BasicBlock* jmpTarget;
+ if (branchToTrueLabel[0])
+ {
+ jmpTarget = compiler->compCurBB->bbJumpDest;
+ }
+ else
+ {
+ // This case arises only for ordered GT_EQ right now
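+                            // For an ordered floating-point equality this typically produces:
+                            //     jp  skipLabel      ; PF is set only for an unordered (NaN) compare
+                            //     je  <bbJumpDest>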
+ assert((cmp->gtOper == GT_EQ) && ((cmp->gtFlags & GTF_RELOP_NAN_UN) == 0));
+ skipLabel = genCreateTempLabel();
+ jmpTarget = skipLabel;
+ }
+
+ inst_JMP(jumpKind[0], jmpTarget);
+ }
+
+ if (jumpKind[1] != EJ_NONE)
+ {
+ // the second conditional branch always has to be to the true label
+ assert(branchToTrueLabel[1]);
+ inst_JMP(jumpKind[1], compiler->compCurBB->bbJumpDest);
+ }
+
+ if (skipLabel != nullptr)
+ {
+ genDefineTempLabel(skipLabel);
+ }
+ }
+ }
+ break;
+
+ case GT_RETURNTRAP:
+ {
+ // this is nothing but a conditional call to CORINFO_HELP_STOP_FOR_GC
+ // based on the contents of 'data'
+
+ GenTree* data = treeNode->gtOp.gtOp1;
+ genConsumeRegs(data);
+ GenTreeIntCon cns = intForm(TYP_INT, 0);
+ emit->emitInsBinary(INS_cmp, emitTypeSize(TYP_INT), data, &cns);
+
+ BasicBlock* skipLabel = genCreateTempLabel();
+
+ emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
+ inst_JMP(jmpEqual, skipLabel);
+
+ // emit the call to the EE-helper that stops for GC (or other reasons)
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+ regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+ assert(genIsValidIntReg(tmpReg));
+
+ genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN, tmpReg);
+ genDefineTempLabel(skipLabel);
+ }
+ break;
+
+ case GT_STOREIND:
+ genStoreInd(treeNode);
+ break;
+
+ case GT_COPY:
+ // This is handled at the time we call genConsumeReg() on the GT_COPY
+ break;
+
+ case GT_SWAP:
+ {
+ // Swap is only supported for lclVar operands that are enregistered
+ // We do not consume or produce any registers. Both operands remain enregistered.
+ // However, the gc-ness may change.
+ assert(genIsRegCandidateLocal(treeNode->gtOp.gtOp1) && genIsRegCandidateLocal(treeNode->gtOp.gtOp2));
+
+ GenTreeLclVarCommon* lcl1 = treeNode->gtOp.gtOp1->AsLclVarCommon();
+ LclVarDsc* varDsc1 = &(compiler->lvaTable[lcl1->gtLclNum]);
+ var_types type1 = varDsc1->TypeGet();
+ GenTreeLclVarCommon* lcl2 = treeNode->gtOp.gtOp2->AsLclVarCommon();
+ LclVarDsc* varDsc2 = &(compiler->lvaTable[lcl2->gtLclNum]);
+ var_types type2 = varDsc2->TypeGet();
+
+ // We must have both int or both fp regs
+ assert(!varTypeIsFloating(type1) || varTypeIsFloating(type2));
+
+ // FP swap is not yet implemented (and should have NYI'd in LSRA)
+ assert(!varTypeIsFloating(type1));
+
+ regNumber oldOp1Reg = lcl1->gtRegNum;
+ regMaskTP oldOp1RegMask = genRegMask(oldOp1Reg);
+ regNumber oldOp2Reg = lcl2->gtRegNum;
+ regMaskTP oldOp2RegMask = genRegMask(oldOp2Reg);
+
+ // We don't call genUpdateVarReg because we don't have a tree node with the new register.
+ varDsc1->lvRegNum = oldOp2Reg;
+ varDsc2->lvRegNum = oldOp1Reg;
+
+ // Do the xchg
+ emitAttr size = EA_PTRSIZE;
+ if (varTypeGCtype(type1) != varTypeGCtype(type2))
+ {
+ // If the type specified to the emitter is a GC type, it will swap the GC-ness of the registers.
+ // Otherwise it will leave them alone, which is correct if they have the same GC-ness.
+ size = EA_GCREF;
+ }
+ inst_RV_RV(INS_xchg, oldOp1Reg, oldOp2Reg, TYP_I_IMPL, size);
+
+ // Update the gcInfo.
+ // Manually remove these regs for the gc sets (mostly to avoid confusing duplicative dump output)
+ gcInfo.gcRegByrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
+ gcInfo.gcRegGCrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
+
+ // gcMarkRegPtrVal will do the appropriate thing for non-gc types.
+ // It will also dump the updates.
+ gcInfo.gcMarkRegPtrVal(oldOp2Reg, type1);
+ gcInfo.gcMarkRegPtrVal(oldOp1Reg, type2);
+ }
+ break;
+
+ case GT_LIST:
+ case GT_ARGPLACE:
+ // Nothing to do
+ break;
+
+ case GT_PUTARG_STK:
+ genPutArgStk(treeNode);
+ break;
+
+ case GT_PUTARG_REG:
+ {
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ noway_assert(targetType != TYP_STRUCT);
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // commas show up here commonly, as part of a nullchk operation
+ GenTree* op1 = treeNode->gtOp.gtOp1;
+ // If child node is not already in the register we need, move it
+ genConsumeReg(op1);
+ if (treeNode->gtRegNum != op1->gtRegNum)
+ {
+ inst_RV_RV(ins_Copy(targetType), treeNode->gtRegNum, op1->gtRegNum, targetType);
+ }
+ genProduceReg(treeNode);
+ }
+ break;
+
+ case GT_CALL:
+ genCallInstruction(treeNode);
+ break;
+
+ case GT_JMP:
+ genJmpMethod(treeNode);
+ break;
+
+ case GT_LOCKADD:
+ case GT_XCHG:
+ case GT_XADD:
+ genLockedInstructions(treeNode);
+ break;
+
+ case GT_MEMORYBARRIER:
+ instGen_MemoryBarrier();
+ break;
+
+ case GT_CMPXCHG:
+ {
+ GenTreePtr location = treeNode->gtCmpXchg.gtOpLocation; // arg1
+ GenTreePtr value = treeNode->gtCmpXchg.gtOpValue; // arg2
+ GenTreePtr comparand = treeNode->gtCmpXchg.gtOpComparand; // arg3
+
+ assert(location->gtRegNum != REG_NA && location->gtRegNum != REG_RAX);
+ assert(value->gtRegNum != REG_NA && value->gtRegNum != REG_RAX);
+
+ genConsumeReg(location);
+ genConsumeReg(value);
+ genConsumeReg(comparand);
+ // comparand goes to RAX;
+ // Note that we must issue this move after the genConsumeRegs(), in case any of the above
+ // have a GT_COPY from RAX.
+ if (comparand->gtRegNum != REG_RAX)
+ {
+ inst_RV_RV(ins_Copy(comparand->TypeGet()), REG_RAX, comparand->gtRegNum, comparand->TypeGet());
+ }
+
+ // location is Rm
+ instGen(INS_lock);
+
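+            // cmpxchg compares RAX with [location]; if they are equal it stores 'value' there,
+            // otherwise it loads the memory contents into RAX. Either way RAX ends up holding
+            // the original memory value, which is the result of this node.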
+ emit->emitIns_AR_R(INS_cmpxchg, emitTypeSize(targetType), value->gtRegNum, location->gtRegNum, 0);
+
+ // Result is in RAX
+ if (targetReg != REG_RAX)
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, REG_RAX, targetType);
+ }
+ }
+ genProduceReg(treeNode);
+ break;
+
+ case GT_RELOAD:
+ // do nothing - reload is just a marker.
+ // The parent node will call genConsumeReg on this which will trigger the unspill of this node's child
+ // into the register specified in this node.
+ break;
+
+ case GT_NOP:
+ break;
+
+ case GT_NO_OP:
+ if (treeNode->gtFlags & GTF_NO_OP_NO)
+ {
+ noway_assert(!"GTF_NO_OP_NO should not be set");
+ }
+ else
+ {
+ getEmitter()->emitIns_Nop(1);
+ }
+ break;
+
+ case GT_ARR_BOUNDS_CHECK:
+#ifdef FEATURE_SIMD
+ case GT_SIMD_CHK:
+#endif // FEATURE_SIMD
+ genRangeCheck(treeNode);
+ break;
+
+ case GT_PHYSREG:
+ if (treeNode->gtRegNum != treeNode->AsPhysReg()->gtSrcReg)
+ {
+ inst_RV_RV(INS_mov, treeNode->gtRegNum, treeNode->AsPhysReg()->gtSrcReg, targetType);
+
+ genTransferRegGCState(treeNode->gtRegNum, treeNode->AsPhysReg()->gtSrcReg);
+ }
+ genProduceReg(treeNode);
+ break;
+
+ case GT_PHYSREGDST:
+ break;
+
+ case GT_NULLCHECK:
+ {
+ assert(!treeNode->gtOp.gtOp1->isContained());
+ regNumber reg = genConsumeReg(treeNode->gtOp.gtOp1);
+ emit->emitIns_AR_R(INS_cmp, EA_4BYTE, reg, reg, 0);
+ }
+ break;
+
+ case GT_CATCH_ARG:
+
+ noway_assert(handlerGetsXcptnObj(compiler->compCurBB->bbCatchTyp));
+
+ /* Catch arguments get passed in a register. genCodeForBBlist()
+ would have marked it as holding a GC object, but not used. */
+
+ noway_assert(gcInfo.gcRegGCrefSetCur & RBM_EXCEPTION_OBJECT);
+ genConsumeReg(treeNode);
+ break;
+
+#if !FEATURE_EH_FUNCLETS
+ case GT_END_LFIN:
+
+ // Have to clear the ShadowSP of the nesting level which encloses the finally. Generates:
+ // mov dword ptr [ebp-0xC], 0 // for some slot of the ShadowSP local var
+
+ unsigned finallyNesting;
+ finallyNesting = treeNode->gtVal.gtVal1;
+ noway_assert(treeNode->gtVal.gtVal1 < compiler->compHndBBtabCount);
+ noway_assert(finallyNesting < compiler->compHndBBtabCount);
+
+ // The last slot is reserved for ICodeManager::FixContext(ppEndRegion)
+ unsigned filterEndOffsetSlotOffs;
+ PREFIX_ASSUME(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) >
+ TARGET_POINTER_SIZE); // below doesn't underflow.
+ filterEndOffsetSlotOffs =
+ (unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE);
+
+ unsigned curNestingSlotOffs;
+ curNestingSlotOffs = filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE);
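+            // For example, with finallyNesting == 0 this clears the slot immediately preceding
+            // the reserved FixContext slot.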
+ instGen_Store_Imm_Into_Lcl(TYP_I_IMPL, EA_PTRSIZE, 0, compiler->lvaShadowSPslotsVar, curNestingSlotOffs);
+ break;
+#endif // !FEATURE_EH_FUNCLETS
+
+ case GT_PINVOKE_PROLOG:
+ noway_assert(((gcInfo.gcRegGCrefSetCur | gcInfo.gcRegByrefSetCur) & ~fullIntArgRegMask()) == 0);
+
+ // the runtime side requires the codegen here to be consistent
+ emit->emitDisableRandomNops();
+ break;
+
+ case GT_LABEL:
+ genPendingCallLabel = genCreateTempLabel();
+ treeNode->gtLabel.gtLabBB = genPendingCallLabel;
+ emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, genPendingCallLabel, treeNode->gtRegNum);
+ break;
+
+ case GT_STORE_OBJ:
+ if (treeNode->OperIsCopyBlkOp() && !treeNode->AsBlk()->gtBlkOpGcUnsafe)
+ {
+ assert(treeNode->AsObj()->gtGcPtrCount != 0);
+ genCodeForCpObj(treeNode->AsObj());
+ break;
+ }
+ __fallthrough;
+
+ case GT_STORE_DYN_BLK:
+ case GT_STORE_BLK:
+ genCodeForStoreBlk(treeNode->AsBlk());
+ break;
+
+ case GT_JMPTABLE:
+ genJumpTable(treeNode);
+ break;
+
+ case GT_SWITCH_TABLE:
+ genTableBasedSwitch(treeNode);
+ break;
+
+ case GT_ARR_INDEX:
+ genCodeForArrIndex(treeNode->AsArrIndex());
+ break;
+
+ case GT_ARR_OFFSET:
+ genCodeForArrOffset(treeNode->AsArrOffs());
+ break;
+
+ case GT_CLS_VAR_ADDR:
+ getEmitter()->emitIns_R_C(INS_lea, EA_PTRSIZE, targetReg, treeNode->gtClsVar.gtClsVarHnd, 0);
+ genProduceReg(treeNode);
+ break;
+
+#if !defined(_TARGET_64BIT_)
+ case GT_LONG:
+ assert(!treeNode->isContained());
+ genConsumeRegs(treeNode);
+ break;
+#endif
+
+ case GT_IL_OFFSET:
+ // Do nothing; these nodes are simply markers for debug info.
+ break;
+
+ default:
+ {
+#ifdef DEBUG
+ char message[256];
+ sprintf(message, "Unimplemented node type %s\n", GenTree::NodeName(treeNode->OperGet()));
+#endif
+ assert(!"Unknown node in codegen");
+ }
+ break;
+ }
+}
+
+//----------------------------------------------------------------------------------
+// genMultiRegCallStoreToLocal: store multi-reg return value of a call node to a local
+//
+// Arguments:
+// treeNode - Gentree of GT_STORE_LCL_VAR
+//
+// Return Value:
+// None
+//
+// Assumption:
+// The child of store is a multi-reg call node.
+// genProduceReg() on treeNode is made by caller of this routine.
+//
+void CodeGen::genMultiRegCallStoreToLocal(GenTreePtr treeNode)
+{
+ assert(treeNode->OperGet() == GT_STORE_LCL_VAR);
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // Structs of size >=9 and <=16 are returned in two return registers on x64 Unix.
+ assert(varTypeIsStruct(treeNode));
+
+ // Assumption: current x64 Unix implementation requires that a multi-reg struct
+ // var in 'var = call' is flagged as lvIsMultiRegRet to prevent it from
+ // being struct promoted.
+ unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum;
+ LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
+ noway_assert(varDsc->lvIsMultiRegRet);
+
+ GenTree* op1 = treeNode->gtGetOp1();
+ GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
+ GenTreeCall* call = actualOp1->AsCall();
+ assert(call->HasMultiRegRetVal());
+
+ genConsumeRegs(op1);
+
+ ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+ assert(retTypeDesc->GetReturnRegCount() == MAX_RET_REG_COUNT);
+ unsigned regCount = retTypeDesc->GetReturnRegCount();
+
+ if (treeNode->gtRegNum != REG_NA)
+ {
+ // Right now the only enregistrable structs supported are SIMD types.
+ assert(varTypeIsSIMD(treeNode));
+ assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(0)));
+ assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(1)));
+
+        // This is the case where the two 8-byte halves that comprise the operand are in
+        // two different xmm registers and need to be assembled into a single
+        // xmm register.
+ regNumber targetReg = treeNode->gtRegNum;
+ regNumber reg0 = call->GetRegNumByIdx(0);
+ regNumber reg1 = call->GetRegNumByIdx(1);
+
+ if (op1->IsCopyOrReload())
+ {
+ // GT_COPY/GT_RELOAD will have valid reg for those positions
+ // that need to be copied or reloaded.
+ regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0);
+ if (reloadReg != REG_NA)
+ {
+ reg0 = reloadReg;
+ }
+
+ reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1);
+ if (reloadReg != REG_NA)
+ {
+ reg1 = reloadReg;
+ }
+ }
+
+ if (targetReg != reg0 && targetReg != reg1)
+ {
+            // Copy reg0 into targetReg and let it be handled by one
+            // of the cases below.
+ inst_RV_RV(ins_Copy(TYP_DOUBLE), targetReg, reg0, TYP_DOUBLE);
+ targetReg = reg0;
+ }
+
+ if (targetReg == reg0)
+ {
+            // targetReg[63:0] = targetReg[63:0]
+            // targetReg[127:64] = reg1[63:0]
+ inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg1, 0x00);
+ }
+ else
+ {
+ assert(targetReg == reg1);
+
+ // We need two shuffles to achieve this
+ // First:
+            // targetReg[63:0] = targetReg[63:0]
+ // targetReg[127:64] = reg0[63:0]
+ //
+ // Second:
+            // targetReg[63:0] = targetReg[127:64]
+ // targetReg[127:64] = targetReg[63:0]
+ //
+ // Essentially copy low 8-bytes from reg0 to high 8-bytes of targetReg
+ // and next swap low and high 8-bytes of targetReg to have them
+ // rearranged in the right order.
+ inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg0, 0x00);
+ inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, targetReg, 0x01);
+ }
+ }
+ else
+ {
+ // Stack store
+ int offset = 0;
+ for (unsigned i = 0; i < regCount; ++i)
+ {
+ var_types type = retTypeDesc->GetReturnRegType(i);
+ regNumber reg = call->GetRegNumByIdx(i);
+ if (op1->IsCopyOrReload())
+ {
+ // GT_COPY/GT_RELOAD will have valid reg for those positions
+ // that need to be copied or reloaded.
+ regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(i);
+ if (reloadReg != REG_NA)
+ {
+ reg = reloadReg;
+ }
+ }
+
+ assert(reg != REG_NA);
+ getEmitter()->emitIns_S_R(ins_Store(type), emitTypeSize(type), reg, lclNum, offset);
+ offset += genTypeSize(type);
+ }
+
+ varDsc->lvRegNum = REG_STK;
+ }
+#elif defined(_TARGET_X86_)
+ // Longs are returned in two return registers on x86.
+ assert(varTypeIsLong(treeNode));
+
+ // Assumption: current x86 implementation requires that a multi-reg long
+ // var in 'var = call' is flagged as lvIsMultiRegRet to prevent it from
+ // being promoted.
+ unsigned lclNum = treeNode->AsLclVarCommon()->gtLclNum;
+ LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
+ noway_assert(varDsc->lvIsMultiRegRet);
+
+ GenTree* op1 = treeNode->gtGetOp1();
+ GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
+ GenTreeCall* call = actualOp1->AsCall();
+ assert(call->HasMultiRegRetVal());
+
+ genConsumeRegs(op1);
+
+ ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+ unsigned regCount = retTypeDesc->GetReturnRegCount();
+ assert(regCount == MAX_RET_REG_COUNT);
+
+ // Stack store
+ int offset = 0;
+ for (unsigned i = 0; i < regCount; ++i)
+ {
+ var_types type = retTypeDesc->GetReturnRegType(i);
+ regNumber reg = call->GetRegNumByIdx(i);
+ if (op1->IsCopyOrReload())
+ {
+ // GT_COPY/GT_RELOAD will have valid reg for those positions
+ // that need to be copied or reloaded.
+ regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(i);
+ if (reloadReg != REG_NA)
+ {
+ reg = reloadReg;
+ }
+ }
+
+ assert(reg != REG_NA);
+ getEmitter()->emitIns_S_R(ins_Store(type), emitTypeSize(type), reg, lclNum, offset);
+ offset += genTypeSize(type);
+ }
+
+ varDsc->lvRegNum = REG_STK;
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING && !_TARGET_X86_
+ assert(!"Unreached");
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING && !_TARGET_X86_
+}
+
+//------------------------------------------------------------------------
+// genLclHeap: Generate code for localloc.
+//
+// Arguments:
+// tree - the localloc tree to generate.
+//
+// Notes:
+// Note that for x86, we don't track ESP movements while generating the localloc code.
+// The ESP tracking is used to report stack pointer-relative GC info, which is not
+// interesting while doing the localloc construction. Also, for functions with localloc,
+// we have EBP frames, and EBP-relative locals, and ESP-relative accesses only for function
+// call arguments. We store the ESP after the localloc is complete in the LocAllocSP
+// variable. This variable is implicitly reported to the VM in the GC info (its position
+// is defined by convention relative to other items), and is used by the GC to find the
+// "base" stack pointer in functions with localloc.
+//
+void CodeGen::genLclHeap(GenTreePtr tree)
+{
+ assert(tree->OperGet() == GT_LCLHEAP);
+ assert(compiler->compLocallocUsed);
+
+ GenTreePtr size = tree->gtOp.gtOp1;
+ noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL));
+
+ regNumber targetReg = tree->gtRegNum;
+ regMaskTP tmpRegsMask = tree->gtRsvdRegs;
+ regNumber regCnt = REG_NA;
+ var_types type = genActualType(size->gtType);
+ emitAttr easz = emitTypeSize(type);
+ BasicBlock* endLabel = nullptr;
+
+#ifdef DEBUG
+ // Verify ESP
+ if (compiler->opts.compStackCheckOnRet)
+ {
+ noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC &&
+ compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister &&
+ compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame);
+ getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnEspCheck, 0);
+
+ BasicBlock* esp_check = genCreateTempLabel();
+ emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
+ inst_JMP(jmpEqual, esp_check);
+ getEmitter()->emitIns(INS_BREAKPOINT);
+ genDefineTempLabel(esp_check);
+ }
+#endif
+
+ noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes
+ noway_assert(genStackLevel == 0); // Can't have anything on the stack
+
+ unsigned stackAdjustment = 0;
+ BasicBlock* loop = nullptr;
+
+    // Compute the amount of memory to allocate, properly aligned to STACK_ALIGN.
+ size_t amount = 0;
+ if (size->IsCnsIntOrI())
+ {
+ // If size is a constant, then it must be contained.
+ assert(size->isContained());
+
+ // If amount is zero then return null in targetReg
+ amount = size->gtIntCon.gtIconVal;
+ if (amount == 0)
+ {
+ instGen_Set_Reg_To_Zero(EA_PTRSIZE, targetReg);
+ goto BAILOUT;
+ }
+
+        // 'amount' is the total number of bytes to localloc, rounded up to a STACK_ALIGN boundary
+ amount = AlignUp(amount, STACK_ALIGN);
+ }
+ else
+ {
+ // The localloc requested memory size is non-constant.
+
+ // Put the size value in targetReg. If it is zero, bail out by returning null in targetReg.
+ genConsumeRegAndCopy(size, targetReg);
+ endLabel = genCreateTempLabel();
+ getEmitter()->emitIns_R_R(INS_test, easz, targetReg, targetReg);
+ inst_JMP(EJ_je, endLabel);
+
+ // Compute the size of the block to allocate and perform alignment.
+ // If compInitMem=true, we can reuse targetReg as regcnt,
+ // since we don't need any internal registers.
+ if (compiler->info.compInitMem)
+ {
+ assert(genCountBits(tmpRegsMask) == 0);
+ regCnt = targetReg;
+ }
+ else
+ {
+ assert(genCountBits(tmpRegsMask) >= 1);
+ regMaskTP regCntMask = genFindLowestBit(tmpRegsMask);
+ tmpRegsMask &= ~regCntMask;
+ regCnt = genRegNumFromMask(regCntMask);
+ if (regCnt != targetReg)
+ {
+ // Above, we put the size in targetReg. Now, copy it to our new temp register if necessary.
+ inst_RV_RV(INS_mov, regCnt, targetReg, size->TypeGet());
+ }
+ }
+
+ // Round up the number of bytes to allocate to a STACK_ALIGN boundary. This is done
+ // by code like:
+ // add reg, 15
+ // and reg, -16
+ // However, in the initialized memory case, we need the count of STACK_ALIGN-sized
+ // elements, not a byte count, after the alignment. So instead of the "and", which
+ // becomes unnecessary, generate a shift, e.g.:
+ // add reg, 15
+ // shr reg, 4
+
+ inst_RV_IV(INS_add, regCnt, STACK_ALIGN - 1, emitActualTypeSize(type));
+
+ if (compiler->info.compInitMem)
+ {
+ // Convert the count from a count of bytes to a loop count. We will loop once per
+            // stack alignment size, so each iteration will zero 4 bytes on x86 and 16 bytes on x64.
+ // Note that we zero a single reg-size word per iteration on x86, and 2 reg-size
+ // words per iteration on x64. We will shift off all the stack alignment bits
+ // added above, so there is no need for an 'and' instruction.
+
+ // --- shr regCnt, 2 (or 4) ---
+ inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT_ALL);
+ }
+ else
+ {
+ // Otherwise, mask off the low bits to align the byte count.
+ inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
+ }
+ }
+
+#if FEATURE_FIXED_OUT_ARGS
+ // If we have an outgoing arg area then we must adjust the SP by popping off the
+ // outgoing arg area. We will restore it right before we return from this method.
+ //
+    // Localloc returns stack space that is aligned to STACK_ALIGN bytes. The following
+ // are the cases that need to be handled:
+ // i) Method has out-going arg area.
+ // It is guaranteed that size of out-going arg area is STACK_ALIGN'ed (see fgMorphArgs).
+ // Therefore, we will pop off the out-going arg area from RSP before allocating the localloc space.
+ // ii) Method has no out-going arg area.
+ // Nothing to pop off from the stack.
+ if (compiler->lvaOutgoingArgSpaceSize > 0)
+ {
+ assert((compiler->lvaOutgoingArgSpaceSize % STACK_ALIGN) == 0); // This must be true for the stack to remain
+ // aligned
+ inst_RV_IV(INS_add, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize, EA_PTRSIZE);
+ stackAdjustment += compiler->lvaOutgoingArgSpaceSize;
+ }
+#endif
+
+ if (size->IsCnsIntOrI())
+ {
+ // We should reach here only for non-zero, constant size allocations.
+ assert(amount > 0);
+ assert((amount % STACK_ALIGN) == 0);
+ assert((amount % REGSIZE_BYTES) == 0);
+
+        // For small allocations we will generate up to six 'push 0' instructions inline
+ size_t cntRegSizedWords = amount / REGSIZE_BYTES;
+ if (cntRegSizedWords <= 6)
+ {
+ for (; cntRegSizedWords != 0; cntRegSizedWords--)
+ {
+ inst_IV(INS_push_hide, 0); // push_hide means don't track the stack
+ }
+ goto ALLOC_DONE;
+ }
+
+ bool doNoInitLessThanOnePageAlloc =
+ !compiler->info.compInitMem && (amount < compiler->eeGetPageSize()); // must be < not <=
+
+#ifdef _TARGET_X86_
+ bool needRegCntRegister = true;
+#else // !_TARGET_X86_
+ bool needRegCntRegister = !doNoInitLessThanOnePageAlloc;
+#endif // !_TARGET_X86_
+
+ if (needRegCntRegister)
+ {
+ // If compInitMem=true, we can reuse targetReg as regcnt.
+ // Since size is a constant, regCnt is not yet initialized.
+ assert(regCnt == REG_NA);
+ if (compiler->info.compInitMem)
+ {
+ assert(genCountBits(tmpRegsMask) == 0);
+ regCnt = targetReg;
+ }
+ else
+ {
+ assert(genCountBits(tmpRegsMask) >= 1);
+ regMaskTP regCntMask = genFindLowestBit(tmpRegsMask);
+ tmpRegsMask &= ~regCntMask;
+ regCnt = genRegNumFromMask(regCntMask);
+ }
+ }
+
+ if (doNoInitLessThanOnePageAlloc)
+ {
+ // Since the size is less than a page, simply adjust ESP.
+ // ESP might already be in the guard page, so we must touch it BEFORE
+ // the alloc, not after.
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#ifdef _TARGET_X86_
+ // For x86, we don't want to use "sub ESP" because we don't want the emitter to track the adjustment
+ // to ESP. So do the work in the count register.
+ // TODO-CQ: manipulate ESP directly, to share code, reduce #ifdefs, and improve CQ. This would require
+ // creating a way to temporarily turn off the emitter's tracking of ESP, maybe marking instrDescs as "don't
+ // track".
+ inst_RV_RV(INS_mov, regCnt, REG_SPBASE, TYP_I_IMPL);
+ getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
+ inst_RV_IV(INS_sub, regCnt, amount, EA_PTRSIZE);
+ inst_RV_RV(INS_mov, REG_SPBASE, regCnt, TYP_I_IMPL);
+#else // !_TARGET_X86_
+ getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
+ inst_RV_IV(INS_sub, REG_SPBASE, amount, EA_PTRSIZE);
+#endif // !_TARGET_X86_
+
+ goto ALLOC_DONE;
+ }
+
+ // else, "mov regCnt, amount"
+
+ if (compiler->info.compInitMem)
+ {
+ // When initializing memory, we want 'amount' to be the loop count.
+ assert((amount % STACK_ALIGN) == 0);
+ amount /= STACK_ALIGN;
+ }
+
+ genSetRegToIcon(regCnt, amount, ((int)amount == amount) ? TYP_INT : TYP_LONG);
+ }
+
+ loop = genCreateTempLabel();
+ if (compiler->info.compInitMem)
+ {
+ // At this point 'regCnt' is set to the number of loop iterations for this loop, if each
+ // iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes.
+ // Since we have to zero out the allocated memory AND ensure that RSP is always valid
+ // by tickling the pages, we will just push 0's on the stack.
+
+ assert(genIsValidIntReg(regCnt));
+
+ // Loop:
+ genDefineTempLabel(loop);
+
+#if defined(_TARGET_AMD64_)
+ // Push two 8-byte zeros. This matches the 16-byte STACK_ALIGN value.
+ static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
+ inst_IV(INS_push_hide, 0); // --- push 8-byte 0
+ inst_IV(INS_push_hide, 0); // --- push 8-byte 0
+#elif defined(_TARGET_X86_)
+ // Push a single 4-byte zero. This matches the 4-byte STACK_ALIGN value.
+ static_assert_no_msg(STACK_ALIGN == REGSIZE_BYTES);
+ inst_IV(INS_push_hide, 0); // --- push 4-byte 0
+#endif // _TARGET_X86_
+
+ // Decrement the loop counter and loop if not done.
+ inst_RV(INS_dec, regCnt, TYP_I_IMPL);
+ inst_JMP(EJ_jne, loop);
+ }
+ else
+ {
+ // At this point 'regCnt' is set to the total number of bytes to localloc.
+ //
+ // We don't need to zero out the allocated memory. However, we do have
+ // to tickle the pages to ensure that ESP is always valid and is
+ // in sync with the "stack guard page". Note that in the worst
+ // case ESP is on the last byte of the guard page. Thus you must
+        // touch ESP+0 first, not ESP+0x1000.
+ //
+ // Another subtlety is that you don't want ESP to be exactly on the
+ // boundary of the guard page because PUSH is predecrement, thus
+ // call setup would not touch the guard page but just beyond it
+ //
+ // Note that we go through a few hoops so that ESP never points to
+ // illegal pages at any time during the tickling process
+ //
+ // neg REGCNT
+ // add REGCNT, ESP // reg now holds ultimate ESP
+        //      jb      loop                   // result is smaller than original ESP (no wrap around)
+        //      xor     REGCNT, REGCNT         // Overflow, pick lowest possible number
+ // loop:
+ // test ESP, [ESP+0] // tickle the page
+ // mov REGTMP, ESP
+ // sub REGTMP, PAGE_SIZE
+ // mov ESP, REGTMP
+ // cmp ESP, REGCNT
+ // jae loop
+ //
+ // mov ESP, REG
+ // end:
+ inst_RV(INS_NEG, regCnt, TYP_I_IMPL);
+ inst_RV_RV(INS_add, regCnt, REG_SPBASE, TYP_I_IMPL);
+ inst_JMP(EJ_jb, loop);
+
+ instGen_Set_Reg_To_Zero(EA_PTRSIZE, regCnt);
+
+ genDefineTempLabel(loop);
+
+ // Tickle the decremented value, and move back to ESP,
+ // note that it has to be done BEFORE the update of ESP since
+ // ESP might already be on the guard page. It is OK to leave
+ // the final value of ESP on the guard page
+ getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
+
+ // This is a harmless trick to avoid the emitter trying to track the
+ // decrement of the ESP - we do the subtraction in another reg instead
+ // of adjusting ESP directly.
+ assert(tmpRegsMask != RBM_NONE);
+ assert(genCountBits(tmpRegsMask) == 1);
+ regNumber regTmp = genRegNumFromMask(tmpRegsMask);
+
+ inst_RV_RV(INS_mov, regTmp, REG_SPBASE, TYP_I_IMPL);
+ inst_RV_IV(INS_sub, regTmp, compiler->eeGetPageSize(), EA_PTRSIZE);
+ inst_RV_RV(INS_mov, REG_SPBASE, regTmp, TYP_I_IMPL);
+
+ inst_RV_RV(INS_cmp, REG_SPBASE, regCnt, TYP_I_IMPL);
+ inst_JMP(EJ_jae, loop);
+
+ // Move the final value to ESP
+ inst_RV_RV(INS_mov, REG_SPBASE, regCnt);
+ }
+
+ALLOC_DONE:
+ // Re-adjust SP to allocate out-going arg area
+ if (stackAdjustment > 0)
+ {
+ assert((stackAdjustment % STACK_ALIGN) == 0); // This must be true for the stack to remain aligned
+ inst_RV_IV(INS_sub, REG_SPBASE, stackAdjustment, EA_PTRSIZE);
+ }
+
+ // Return the stackalloc'ed address in result register.
+ // TargetReg = RSP + stackAdjustment.
+ getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, targetReg, REG_SPBASE, stackAdjustment);
+
+ if (endLabel != nullptr)
+ {
+ genDefineTempLabel(endLabel);
+ }
+
+BAILOUT:
+
+ // Write the lvaLocAllocSPvar stack frame slot
+ noway_assert(compiler->lvaLocAllocSPvar != BAD_VAR_NUM);
+ getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaLocAllocSPvar, 0);
+
+#if STACK_PROBES
+ if (compiler->opts.compNeedStackProbes)
+ {
+ genGenerateStackProbe();
+ }
+#endif
+
+#ifdef DEBUG
+ // Update new ESP
+ if (compiler->opts.compStackCheckOnRet)
+ {
+ noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC &&
+ compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister &&
+ compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame);
+ getEmitter()->emitIns_S_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnEspCheck, 0);
+ }
+#endif
+
+ genProduceReg(tree);
+}
+
+void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
+{
+ if (storeBlkNode->gtBlkOpGcUnsafe)
+ {
+ getEmitter()->emitDisableGC();
+ }
+ bool isCopyBlk = storeBlkNode->OperIsCopyBlkOp();
+
+ switch (storeBlkNode->gtBlkOpKind)
+ {
+#ifdef _TARGET_AMD64_
+ case GenTreeBlk::BlkOpKindHelper:
+ if (isCopyBlk)
+ {
+ genCodeForCpBlk(storeBlkNode);
+ }
+ else
+ {
+ genCodeForInitBlk(storeBlkNode);
+ }
+ break;
+#endif // _TARGET_AMD64_
+ case GenTreeBlk::BlkOpKindRepInstr:
+ if (isCopyBlk)
+ {
+ genCodeForCpBlkRepMovs(storeBlkNode);
+ }
+ else
+ {
+ genCodeForInitBlkRepStos(storeBlkNode);
+ }
+ break;
+ case GenTreeBlk::BlkOpKindUnroll:
+ if (isCopyBlk)
+ {
+ genCodeForCpBlkUnroll(storeBlkNode);
+ }
+ else
+ {
+ genCodeForInitBlkUnroll(storeBlkNode);
+ }
+ break;
+ default:
+ unreached();
+ }
+ if (storeBlkNode->gtBlkOpGcUnsafe)
+ {
+ getEmitter()->emitEnableGC();
+ }
+}
+
+// Generate code for InitBlk using rep stos.
+// Preconditions:
+// The size of the buffer must be a constant and less than INITBLK_STOS_LIMIT bytes.
+// For any size larger than that, we'll use the helper even if both the
+// fill byte and the size are integer constants.
+void CodeGen::genCodeForInitBlkRepStos(GenTreeBlk* initBlkNode)
+{
+ // Make sure we got the arguments of the initblk/initobj operation in the right registers
+ unsigned size = initBlkNode->Size();
+ GenTreePtr dstAddr = initBlkNode->Addr();
+ GenTreePtr initVal = initBlkNode->Data();
+
+#ifdef DEBUG
+ assert(!dstAddr->isContained());
+ assert(!initVal->isContained());
+#ifdef _TARGET_AMD64_
+ assert(size != 0);
+#endif
+ if (initVal->IsCnsIntOrI())
+ {
+#ifdef _TARGET_AMD64_
+ assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT);
+#else
+ assert(size > CPBLK_UNROLL_LIMIT);
+#endif
+ }
+
+#endif // DEBUG
+
+ genConsumeBlockOp(initBlkNode, REG_RDI, REG_RAX, REG_RCX);
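+    // rep stosb stores AL into [RDI] RCX times, incrementing RDI as it goes, which is why the
+    // operands were consumed into those fixed registers above.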
+ instGen(INS_r_stosb);
+}
+
+// Generate code for InitBlk by performing a loop unroll
+// Preconditions:
+// a) Both the size and fill byte value are integer constants.
+// b) The size of the struct to initialize is no larger than INITBLK_UNROLL_LIMIT bytes.
+//
+void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* initBlkNode)
+{
+ // Make sure we got the arguments of the initblk/initobj operation in the right registers
+ unsigned size = initBlkNode->Size();
+ GenTreePtr dstAddr = initBlkNode->Addr();
+ GenTreePtr initVal = initBlkNode->Data();
+
+ assert(!dstAddr->isContained());
+ assert(!initVal->isContained());
+ assert(size != 0);
+ assert(size <= INITBLK_UNROLL_LIMIT);
+ assert(initVal->gtSkipReloadOrCopy()->IsCnsIntOrI());
+
+ emitter* emit = getEmitter();
+
+ genConsumeOperands(initBlkNode);
+
+ // If the initVal was moved, or spilled and reloaded to a different register,
+ // get the original initVal from below the GT_RELOAD, but only after capturing the valReg,
+ // which needs to be the new register.
+ regNumber valReg = initVal->gtRegNum;
+ initVal = initVal->gtSkipReloadOrCopy();
+
+ unsigned offset = 0;
+
+ // Perform an unroll using SSE2 loads and stores.
+ if (size >= XMM_REGSIZE_BYTES)
+ {
+ regNumber tmpReg = genRegNumFromMask(initBlkNode->gtRsvdRegs);
+
+#ifdef DEBUG
+ assert(initBlkNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(initBlkNode->gtRsvdRegs) == 1);
+ assert(genIsValidFloatReg(tmpReg));
+#endif // DEBUG
+
+ if (initVal->gtIntCon.gtIconVal != 0)
+ {
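+            // valReg is expected to hold the fill byte already replicated through an integer;
+            // broaden that pattern to the full 16-byte xmm width for the movdqu stores below.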
+ emit->emitIns_R_R(INS_mov_i2xmm, EA_PTRSIZE, tmpReg, valReg);
+ emit->emitIns_R_R(INS_punpckldq, EA_8BYTE, tmpReg, tmpReg);
+#ifdef _TARGET_X86_
+ // For x86, we need one more to convert it from 8 bytes to 16 bytes.
+ emit->emitIns_R_R(INS_punpckldq, EA_8BYTE, tmpReg, tmpReg);
+#endif // _TARGET_X86_
+ }
+ else
+ {
+ emit->emitIns_R_R(INS_xorpd, EA_8BYTE, tmpReg, tmpReg);
+ }
+
+ // Determine how many 16 byte slots we're going to fill using SSE movs.
+ size_t slots = size / XMM_REGSIZE_BYTES;
+
+ while (slots-- > 0)
+ {
+ emit->emitIns_AR_R(INS_movdqu, EA_8BYTE, tmpReg, dstAddr->gtRegNum, offset);
+ offset += XMM_REGSIZE_BYTES;
+ }
+ }
+
+ // Fill the remainder (or a < 16 byte sized struct)
+ if ((size & 8) != 0)
+ {
+#ifdef _TARGET_X86_
+ // TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs.
+ emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
+ offset += 4;
+ emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
+ offset += 4;
+#else // !_TARGET_X86_
+ emit->emitIns_AR_R(INS_mov, EA_8BYTE, valReg, dstAddr->gtRegNum, offset);
+ offset += 8;
+#endif // !_TARGET_X86_
+ }
+ if ((size & 4) != 0)
+ {
+ emit->emitIns_AR_R(INS_mov, EA_4BYTE, valReg, dstAddr->gtRegNum, offset);
+ offset += 4;
+ }
+ if ((size & 2) != 0)
+ {
+ emit->emitIns_AR_R(INS_mov, EA_2BYTE, valReg, dstAddr->gtRegNum, offset);
+ offset += 2;
+ }
+ if ((size & 1) != 0)
+ {
+ emit->emitIns_AR_R(INS_mov, EA_1BYTE, valReg, dstAddr->gtRegNum, offset);
+ }
+}
+
+// Generates code for InitBlk by calling the VM memset helper function.
+// Preconditions:
+// a) The size argument of the InitBlk is not an integer constant.
+// b) The size argument of the InitBlk is >= INITBLK_STOS_LIMIT bytes.
+void CodeGen::genCodeForInitBlk(GenTreeBlk* initBlkNode)
+{
+#ifdef _TARGET_AMD64_
+ // Make sure we got the arguments of the initblk operation in the right registers
+ unsigned blockSize = initBlkNode->Size();
+ GenTreePtr dstAddr = initBlkNode->Addr();
+ GenTreePtr initVal = initBlkNode->Data();
+
+ assert(!dstAddr->isContained());
+ assert(!initVal->isContained());
+
+ if (blockSize != 0)
+ {
+ assert(blockSize >= CPBLK_MOVS_LIMIT);
+ }
+
+ genConsumeBlockOp(initBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2);
+
+ genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN);
+#else // !_TARGET_AMD64_
+ NYI_X86("Helper call for InitBlk");
+#endif // !_TARGET_AMD64_
+}
+
+// Generate code for a load from some address + offset
+// baseNode: tree node which can be either a local address or arbitrary node
+// offset: distance from the baseNode from which to load
+void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* baseNode, unsigned offset)
+{
+ emitter* emit = getEmitter();
+
+ if (baseNode->OperIsLocalAddr())
+ {
+ if (baseNode->gtOper == GT_LCL_FLD_ADDR)
+ {
+ offset += baseNode->gtLclFld.gtLclOffs;
+ }
+ emit->emitIns_R_S(ins, size, dst, baseNode->gtLclVarCommon.gtLclNum, offset);
+ }
+ else
+ {
+ emit->emitIns_R_AR(ins, size, dst, baseNode->gtRegNum, offset);
+ }
+}
+
+//------------------------------------------------------------------------
+// genCodeForStoreOffset: Generate code to store a reg to [base + offset].
+//
+// Arguments:
+// ins - the instruction to generate.
+// size - the size that needs to be stored.
+// src - the register which needs to be stored.
+// baseNode - the base, relative to which to store the src register.
+// offset - the offset that is added to the baseNode to calculate the address to store into.
+//
+void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber src, GenTree* baseNode, unsigned offset)
+{
+ emitter* emit = getEmitter();
+
+ if (baseNode->OperIsLocalAddr())
+ {
+ if (baseNode->gtOper == GT_LCL_FLD_ADDR)
+ {
+ offset += baseNode->gtLclFld.gtLclOffs;
+ }
+
+ emit->emitIns_S_R(ins, size, src, baseNode->AsLclVarCommon()->GetLclNum(), offset);
+ }
+ else
+ {
+ emit->emitIns_AR_R(ins, size, src, baseNode->gtRegNum, offset);
+ }
+}
+
+// Generates CpBlk code by performing a loop unroll
+// Preconditions:
+// The size argument of the CpBlk node is a constant and <= 64 bytes.
+// This may seem small but covers >95% of the cases in several framework assemblies.
+//
+void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
+{
+ // Make sure we got the arguments of the cpblk operation in the right registers
+ unsigned size = cpBlkNode->Size();
+ GenTreePtr dstAddr = cpBlkNode->Addr();
+ GenTreePtr source = cpBlkNode->Data();
+ GenTreePtr srcAddr = nullptr;
+ assert(size <= CPBLK_UNROLL_LIMIT);
+
+ emitter* emit = getEmitter();
+
+ if (source->gtOper == GT_IND)
+ {
+ srcAddr = source->gtGetOp1();
+ if (!srcAddr->isContained())
+ {
+ genConsumeReg(srcAddr);
+ }
+ }
+ else
+ {
+ noway_assert(source->IsLocal());
+ // TODO-Cleanup: Consider making the addrForm() method in Rationalize public, e.g. in GenTree.
+ // OR: transform source to GT_IND(GT_LCL_VAR_ADDR)
+ if (source->OperGet() == GT_LCL_VAR)
+ {
+ source->SetOper(GT_LCL_VAR_ADDR);
+ }
+ else
+ {
+ assert(source->OperGet() == GT_LCL_FLD);
+ source->SetOper(GT_LCL_FLD_ADDR);
+ }
+ srcAddr = source;
+ }
+
+ if (!dstAddr->isContained())
+ {
+ genConsumeReg(dstAddr);
+ }
+
+ unsigned offset = 0;
+
+    // If the size of this struct is 16 bytes or more, use SSE2 xmm loads
+    // and stores to copy it 16 bytes at a time.
+
+ if (size >= XMM_REGSIZE_BYTES)
+ {
+ assert(cpBlkNode->gtRsvdRegs != RBM_NONE);
+ regNumber xmmReg = genRegNumFromMask(cpBlkNode->gtRsvdRegs & RBM_ALLFLOAT);
+ assert(genIsValidFloatReg(xmmReg));
+ size_t slots = size / XMM_REGSIZE_BYTES;
+
+ // TODO: In the below code the load and store instructions are for 16 bytes, but the
+ // type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
+ // this probably needs to be changed.
+ while (slots-- > 0)
+ {
+ // Load
+ genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmReg, srcAddr, offset);
+ // Store
+ genCodeForStoreOffset(INS_movdqu, EA_8BYTE, xmmReg, dstAddr, offset);
+ offset += XMM_REGSIZE_BYTES;
+ }
+ }
+
+ // Fill the remainder (15 bytes or less) if there's one.
+ if ((size & 0xf) != 0)
+ {
+ // Grab the integer temp register to emit the remaining loads and stores.
+ regNumber tmpReg = genRegNumFromMask(cpBlkNode->gtRsvdRegs & RBM_ALLINT);
+
+ if ((size & 8) != 0)
+ {
+#ifdef _TARGET_X86_
+ // TODO-X86-CQ: [1091735] Revisit block ops codegen. One example: use movq for 8 byte movs.
+ for (unsigned savedOffs = offset; offset < savedOffs + 8; offset += 4)
+ {
+ genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset);
+ genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset);
+ }
+#else // !_TARGET_X86_
+ genCodeForLoadOffset(INS_mov, EA_8BYTE, tmpReg, srcAddr, offset);
+ genCodeForStoreOffset(INS_mov, EA_8BYTE, tmpReg, dstAddr, offset);
+ offset += 8;
+#endif // !_TARGET_X86_
+ }
+ if ((size & 4) != 0)
+ {
+ genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset);
+ genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset);
+ offset += 4;
+ }
+ if ((size & 2) != 0)
+ {
+ genCodeForLoadOffset(INS_mov, EA_2BYTE, tmpReg, srcAddr, offset);
+ genCodeForStoreOffset(INS_mov, EA_2BYTE, tmpReg, dstAddr, offset);
+ offset += 2;
+ }
+ if ((size & 1) != 0)
+ {
+ genCodeForLoadOffset(INS_mov, EA_1BYTE, tmpReg, srcAddr, offset);
+ genCodeForStoreOffset(INS_mov, EA_1BYTE, tmpReg, dstAddr, offset);
+ }
+ }
+}
+
+// Generate code for CpBlk by using rep movs
+// Preconditions:
+// The size argument of the CpBlk is a constant and is between
+// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes.
+void CodeGen::genCodeForCpBlkRepMovs(GenTreeBlk* cpBlkNode)
+{
+ // Make sure we got the arguments of the cpblk operation in the right registers
+ unsigned size = cpBlkNode->Size();
+ GenTreePtr dstAddr = cpBlkNode->Addr();
+ GenTreePtr source = cpBlkNode->Data();
+ GenTreePtr srcAddr = nullptr;
+
+#ifdef DEBUG
+ assert(!dstAddr->isContained());
+ assert(source->isContained());
+
+#ifdef _TARGET_X86_
+ if (size == 0)
+ {
+ noway_assert(cpBlkNode->OperGet() == GT_STORE_DYN_BLK);
+ }
+ else
+#endif
+ {
+#ifdef _TARGET_AMD64_
+ assert(size > CPBLK_UNROLL_LIMIT && size < CPBLK_MOVS_LIMIT);
+#else
+ assert(size > CPBLK_UNROLL_LIMIT);
+#endif
+ }
+#endif // DEBUG
+
+ genConsumeBlockOp(cpBlkNode, REG_RDI, REG_RSI, REG_RCX);
+ instGen(INS_r_movsb);
+}
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+//---------------------------------------------------------------------------------------------------------------//
+// genStructPutArgUnroll: Generates code for passing a struct arg on stack by value using loop unrolling.
+//
+// Arguments:
+// putArgNode - the PutArgStk tree.
+// baseVarNum - the base var number, relative to which the by-val struct will be copied on the stack.
+//
+// TODO-Amd64-Unix: Try to share code with copyblk.
+// Need refactoring of copyblk before it could be used for putarg_stk.
+// The difference for now is that a putarg_stk contains its children, while cpyblk does not.
+// This creates differences in code. After some significant refactoring it could be reused.
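+//
+//     As a rough sketch (register names are placeholders picked from gtRsvdRegs), a 20-byte struct
+//     arg is copied to its outgoing stack slot as:
+//         movdqu xmmT, [srcAddr]                          ; 16 bytes via an XMM temp
+//         movdqu [outArgBase + argOffset], xmmT
+//         mov    eT, [srcAddr+16]                         ; remaining 4 bytes via an integer temp
+//         mov    [outArgBase + argOffset + 16], eT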
+//
+void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode, unsigned baseVarNum)
+{
+ // We will never call this method for SIMD types, which are stored directly
+ // in genPutStructArgStk().
+ noway_assert(putArgNode->TypeGet() == TYP_STRUCT);
+
+    // Make sure we got the arguments of the putarg_stk operation in the right registers
+ GenTreePtr dstAddr = putArgNode;
+ GenTreePtr src = putArgNode->gtOp.gtOp1;
+
+ size_t size = putArgNode->getArgSize();
+ assert(size <= CPBLK_UNROLL_LIMIT);
+
+ emitter* emit = getEmitter();
+ unsigned putArgOffset = putArgNode->getArgOffset();
+
+ assert(src->isContained());
+
+ assert(src->gtOper == GT_OBJ);
+
+ if (!src->gtOp.gtOp1->isContained())
+ {
+ genConsumeReg(src->gtOp.gtOp1);
+ }
+
+ unsigned offset = 0;
+
+ // If the size of this struct is larger than 16 bytes
+ // let's use SSE2 to be able to do 16 byte at a time
+ // loads and stores.
+ if (size >= XMM_REGSIZE_BYTES)
+ {
+ assert(putArgNode->gtRsvdRegs != RBM_NONE);
+ regNumber xmmReg = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLFLOAT);
+ assert(genIsValidFloatReg(xmmReg));
+ size_t slots = size / XMM_REGSIZE_BYTES;
+
+ assert(putArgNode->gtGetOp1()->isContained());
+ assert(putArgNode->gtGetOp1()->gtOp.gtOper == GT_OBJ);
+
+ // TODO: In the below code the load and store instructions are for 16 bytes, but the
+ // type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
+ // this probably needs to be changed.
+ while (slots-- > 0)
+ {
+ // Load
+ genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmReg, src->gtGetOp1(),
+ offset); // Load the address of the child of the Obj node.
+
+ // Store
+ emit->emitIns_S_R(INS_movdqu, EA_8BYTE, xmmReg, baseVarNum, putArgOffset + offset);
+
+ offset += XMM_REGSIZE_BYTES;
+ }
+ }
+
+ // Fill the remainder (15 bytes or less) if there's one.
+ if ((size & 0xf) != 0)
+ {
+ // Grab the integer temp register to emit the remaining loads and stores.
+ regNumber tmpReg = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLINT);
+ assert(genIsValidIntReg(tmpReg));
+
+ if ((size & 8) != 0)
+ {
+ genCodeForLoadOffset(INS_mov, EA_8BYTE, tmpReg, src->gtOp.gtOp1, offset);
+
+ emit->emitIns_S_R(INS_mov, EA_8BYTE, tmpReg, baseVarNum, putArgOffset + offset);
+
+ offset += 8;
+ }
+
+ if ((size & 4) != 0)
+ {
+ genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, src->gtOp.gtOp1, offset);
+
+ emit->emitIns_S_R(INS_mov, EA_4BYTE, tmpReg, baseVarNum, putArgOffset + offset);
+
+ offset += 4;
+ }
+
+ if ((size & 2) != 0)
+ {
+ genCodeForLoadOffset(INS_mov, EA_2BYTE, tmpReg, src->gtOp.gtOp1, offset);
+
+ emit->emitIns_S_R(INS_mov, EA_2BYTE, tmpReg, baseVarNum, putArgOffset + offset);
+
+ offset += 2;
+ }
+
+ if ((size & 1) != 0)
+ {
+ genCodeForLoadOffset(INS_mov, EA_1BYTE, tmpReg, src->gtOp.gtOp1, offset);
+ emit->emitIns_S_R(INS_mov, EA_1BYTE, tmpReg, baseVarNum, putArgOffset + offset);
+ }
+ }
+}
+
+//------------------------------------------------------------------------
+// genStructPutArgRepMovs: Generates code for passing a struct arg by value on stack using Rep Movs.
+//
+// Arguments:
+// putArgNode - the PutArgStk tree.
+// baseVarNum - the base var number, relative to which the by-val struct bits will go.
+//
+// Preconditions:
+// The size argument of the PutArgStk (for structs) is a constant and is between
+// CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes.
+//
+void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode, unsigned baseVarNum)
+{
+ assert(putArgNode->TypeGet() == TYP_STRUCT);
+ assert(putArgNode->getArgSize() > CPBLK_UNROLL_LIMIT);
+ assert(baseVarNum != BAD_VAR_NUM);
+
+    // Make sure we got the arguments of the putarg_stk operation in the right registers
+ GenTreePtr dstAddr = putArgNode;
+ GenTreePtr srcAddr = putArgNode->gtGetOp1();
+
+ // Validate state.
+ assert(putArgNode->gtRsvdRegs == (RBM_RDI | RBM_RCX | RBM_RSI));
+ assert(srcAddr->isContained());
+
+ genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX, baseVarNum);
+ instGen(INS_r_movsb);
+}
+
+//------------------------------------------------------------------------
+// genClearStackVec3ArgUpperBits: Clear the upper bits of incoming Vector3 args.
+//
+// If any Vector3 args are on the stack and they are not pass-by-ref, the upper 32 bits
+// must be cleared to zero. The native compiler doesn't clear the upper bits,
+// and there is no way to know whether the caller is native or not. So, the upper
+// 32 bits of a Vector3 argument on the stack are always cleared to zero.
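+//
+// As an illustration of the two cases handled below: a stack-passed Vector3 arg gets
+//     mov dword ptr [V_ARG_BASE+0xC], 0
+// in the prolog, while for a register-passed arg the byte-shift pair
+//     pslldq argReg, 12    ; argReg <<= 96 bits
+//     psrldq argReg, 12    ; argReg >>= 96 bits
+// zeroes everything above the low 32 bits of argReg.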
+#ifdef FEATURE_SIMD
+void CodeGen::genClearStackVec3ArgUpperBits()
+{
+#ifdef DEBUG
+ if (verbose)
+ printf("*************** In genClearStackVec3ArgUpperBits()\n");
+#endif
+
+ assert(compiler->compGeneratingProlog);
+
+    for (unsigned varNum = 0; varNum < compiler->info.compArgsCount; varNum++)
+ {
+ LclVarDsc* varDsc = &(compiler->lvaTable[varNum]);
+ assert(varDsc->lvIsParam);
+
+        // Does the var have a SIMD12 type?
+ if (varDsc->lvType != TYP_SIMD12)
+ {
+ continue;
+ }
+
+ if (!varDsc->lvIsRegArg)
+ {
+ // Clear the upper 32 bits by mov dword ptr [V_ARG_BASE+0xC], 0
+ getEmitter()->emitIns_S_I(ins_Store(TYP_INT), EA_4BYTE, varNum, genTypeSize(TYP_FLOAT) * 3, 0);
+ }
+ else
+ {
+ // Assume that for x64 linux, an argument is fully in registers
+ // or fully on stack.
+ regNumber argReg = varDsc->GetOtherArgReg();
+
+ // Clear the upper 32 bits by two shift instructions.
+ // argReg = argReg << 96
+ getEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), argReg, 12);
+ // argReg = argReg >> 96
+ getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), argReg, 12);
+ }
+ }
+}
+#endif // FEATURE_SIMD
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+// Generate code for CpObj nodes which copy structs that have interleaved
+// GC pointers.
+// This will generate a sequence of movsq instructions for the cases of non-gc members,
+// and calls to the CORINFO_HELP_ASSIGN_BYREF helper otherwise.
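+//
+// For example (illustrative), a struct whose GC layout is [nongc, nongc, gc, nongc], with
+// RSI/RDI holding the source/destination addresses, is copied roughly as:
+//     movsq                              ; slot 0
+//     movsq                              ; slot 1
+//     call CORINFO_HELP_ASSIGN_BYREF     ; slot 2 (GC pointer)
+//     movsq                              ; slot 3
+// A contiguous non-GC run of CPOBJ_NONGC_SLOTS_LIMIT or more slots is instead copied with
+//     mov ecx, <slot count>
+//     rep movsq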
+void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode)
+{
+ // Make sure we got the arguments of the cpobj operation in the right registers
+ GenTreePtr dstAddr = cpObjNode->Addr();
+ GenTreePtr source = cpObjNode->Data();
+ GenTreePtr srcAddr = nullptr;
+ bool sourceIsLocal = false;
+
+ assert(source->isContained());
+ if (source->gtOper == GT_IND)
+ {
+ srcAddr = source->gtGetOp1();
+ assert(!srcAddr->isContained());
+ }
+ else
+ {
+ noway_assert(source->IsLocal());
+ sourceIsLocal = true;
+ // TODO: Consider making the addrForm() method in Rationalize public, e.g. in GenTree.
+ // OR: transform source to GT_IND(GT_LCL_VAR_ADDR)
+ if (source->OperGet() == GT_LCL_VAR)
+ {
+ source->SetOper(GT_LCL_VAR_ADDR);
+ }
+ else
+ {
+ assert(source->OperGet() == GT_LCL_FLD);
+ source->SetOper(GT_LCL_FLD_ADDR);
+ }
+ srcAddr = source;
+ }
+
+ bool dstOnStack = dstAddr->OperIsLocalAddr();
+
+#ifdef DEBUG
+ bool isRepMovsqUsed = false;
+
+ assert(!dstAddr->isContained());
+
+ // If the GenTree node has data about GC pointers, this means we're dealing
+ // with CpObj, so this requires special logic.
+ assert(cpObjNode->gtGcPtrCount > 0);
+
+    // The movsq instruction is used for copying non-gcref fields and it needs
+    // src = RSI and dst = RDI.
+ // Either these registers must not contain lclVars, or they must be dying or marked for spill.
+ // This is because these registers are incremented as we go through the struct.
+ GenTree* actualSrcAddr = srcAddr->gtSkipReloadOrCopy();
+ GenTree* actualDstAddr = dstAddr->gtSkipReloadOrCopy();
+ unsigned srcLclVarNum = BAD_VAR_NUM;
+ unsigned dstLclVarNum = BAD_VAR_NUM;
+ bool isSrcAddrLiveOut = false;
+ bool isDstAddrLiveOut = false;
+ if (genIsRegCandidateLocal(actualSrcAddr))
+ {
+ srcLclVarNum = actualSrcAddr->AsLclVarCommon()->gtLclNum;
+ isSrcAddrLiveOut = ((actualSrcAddr->gtFlags & (GTF_VAR_DEATH | GTF_SPILL)) == 0);
+ }
+ if (genIsRegCandidateLocal(actualDstAddr))
+ {
+ dstLclVarNum = actualDstAddr->AsLclVarCommon()->gtLclNum;
+ isDstAddrLiveOut = ((actualDstAddr->gtFlags & (GTF_VAR_DEATH | GTF_SPILL)) == 0);
+ }
+ assert((actualSrcAddr->gtRegNum != REG_RSI) || !isSrcAddrLiveOut ||
+ ((srcLclVarNum == dstLclVarNum) && !isDstAddrLiveOut));
+ assert((actualDstAddr->gtRegNum != REG_RDI) || !isDstAddrLiveOut ||
+ ((srcLclVarNum == dstLclVarNum) && !isSrcAddrLiveOut));
+#endif // DEBUG
+
+ // Consume these registers.
+ // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
+ if (sourceIsLocal)
+ {
+ inst_RV_TT(INS_lea, REG_RSI, source, 0, EA_BYREF);
+ genConsumeBlockOp(cpObjNode, REG_RDI, REG_NA, REG_NA);
+ }
+ else
+ {
+ genConsumeBlockOp(cpObjNode, REG_RDI, REG_RSI, REG_NA);
+ }
+ gcInfo.gcMarkRegPtrVal(REG_RSI, srcAddr->TypeGet());
+ gcInfo.gcMarkRegPtrVal(REG_RDI, dstAddr->TypeGet());
+
+ unsigned slots = cpObjNode->gtSlots;
+
+ // If we can prove it's on the stack we don't need to use the write barrier.
+ if (dstOnStack)
+ {
+ if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
+ {
+#ifdef DEBUG
+ // If the destination of the CpObj is on the stack
+ // make sure we allocated RCX to emit rep movsq.
+ regNumber tmpReg = genRegNumFromMask(cpObjNode->gtRsvdRegs & RBM_ALLINT);
+ assert(tmpReg == REG_RCX);
+ isRepMovsqUsed = true;
+#endif // DEBUG
+
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, slots);
+ instGen(INS_r_movsq);
+ }
+ else
+ {
+ // For small structs, it's better to emit a sequence of movsq than to
+ // emit a rep movsq instruction.
+ while (slots > 0)
+ {
+ instGen(INS_movsq);
+ slots--;
+ }
+ }
+ }
+ else
+ {
+ BYTE* gcPtrs = cpObjNode->gtGcPtrs;
+ unsigned gcPtrCount = cpObjNode->gtGcPtrCount;
+
+ unsigned i = 0;
+ while (i < slots)
+ {
+ switch (gcPtrs[i])
+ {
+ case TYPE_GC_NONE:
+ // Let's see if we can use rep movsq instead of a sequence of movsq instructions
+ // to save cycles and code size.
+ {
+ unsigned nonGcSlotCount = 0;
+
+ do
+ {
+ nonGcSlotCount++;
+ i++;
+ } while (i < slots && gcPtrs[i] == TYPE_GC_NONE);
+
+ // If we have a very small contiguous non-gc region, it's better just to
+ // emit a sequence of movsq instructions
+ if (nonGcSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
+ {
+ while (nonGcSlotCount > 0)
+ {
+ instGen(INS_movsq);
+ nonGcSlotCount--;
+ }
+ }
+ else
+ {
+#ifdef DEBUG
+ // Otherwise, we can save code-size and improve CQ by emitting
+ // rep movsq
+ regNumber tmpReg = genRegNumFromMask(cpObjNode->gtRsvdRegs & RBM_ALLINT);
+ assert(tmpReg == REG_RCX);
+ isRepMovsqUsed = true;
+#endif // DEBUG
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, nonGcSlotCount);
+ instGen(INS_r_movsq);
+ }
+ }
+ break;
+ default:
+                // We have a GC pointer; call the byref assignment (write barrier) helper.
+ genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE);
+ gcPtrCount--;
+ i++;
+ }
+ }
+
+ assert(gcPtrCount == 0);
+ }
+
+ // Clear the gcInfo for RSI and RDI.
+ // While we normally update GC info prior to the last instruction that uses them,
+ // these actually live into the helper call.
+ gcInfo.gcMarkRegSetNpt(RBM_RSI);
+ gcInfo.gcMarkRegSetNpt(RBM_RDI);
+}
+
+// Generate code for a CpBlk node by means of the VM memcpy helper call.
+// Preconditions:
+// a) The size argument of the CpBlk is not an integer constant, or
+// b) The size argument is a constant but is larger than CPBLK_MOVS_LIMIT bytes.
+void CodeGen::genCodeForCpBlk(GenTreeBlk* cpBlkNode)
+{
+#ifdef _TARGET_AMD64_
+ // Make sure we got the arguments of the cpblk operation in the right registers
+ unsigned blockSize = cpBlkNode->Size();
+ GenTreePtr dstAddr = cpBlkNode->Addr();
+ GenTreePtr source = cpBlkNode->Data();
+ GenTreePtr srcAddr = nullptr;
+
+ // Size goes in arg2
+ if (blockSize != 0)
+ {
+ assert(blockSize >= CPBLK_MOVS_LIMIT);
+ assert((cpBlkNode->gtRsvdRegs & RBM_ARG_2) != 0);
+ }
+ else
+ {
+ noway_assert(cpBlkNode->gtOper == GT_STORE_DYN_BLK);
+ }
+
+ // Source address goes in arg1
+ if (source->gtOper == GT_IND)
+ {
+ srcAddr = source->gtGetOp1();
+ assert(!srcAddr->isContained());
+ }
+ else
+ {
+ noway_assert(source->IsLocal());
+ assert((cpBlkNode->gtRsvdRegs & RBM_ARG_1) != 0);
+ inst_RV_TT(INS_lea, REG_ARG_1, source, 0, EA_BYREF);
+ }
+
+ genConsumeBlockOp(cpBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2);
+
+ genEmitHelperCall(CORINFO_HELP_MEMCPY, 0, EA_UNKNOWN);
+#else // !_TARGET_AMD64_
+ noway_assert(false && "Helper call for CpBlk is not needed.");
+#endif // !_TARGET_AMD64_
+}
+
+// Generate code to do a switch statement based on a table of ip-relative offsets.
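+// As a sketch of the emitted sequence (register names are illustrative):
+//     mov  baseReg, dword ptr [baseReg + idxReg*4]   ; load the ip-relative offset from the table
+//     lea  tmpReg, [fgFirstBB]                       ; relocatable address of the method's first block
+//     add  baseReg, tmpReg
+//     jmp  baseReg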
+void CodeGen::genTableBasedSwitch(GenTree* treeNode)
+{
+ genConsumeOperands(treeNode->AsOp());
+ regNumber idxReg = treeNode->gtOp.gtOp1->gtRegNum;
+ regNumber baseReg = treeNode->gtOp.gtOp2->gtRegNum;
+
+ regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+
+ // load the ip-relative offset (which is relative to start of fgFirstBB)
+ getEmitter()->emitIns_R_ARX(INS_mov, EA_4BYTE, baseReg, baseReg, idxReg, 4, 0);
+
+ // add it to the absolute address of fgFirstBB
+ compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET;
+ getEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, tmpReg);
+ getEmitter()->emitIns_R_R(INS_add, EA_PTRSIZE, baseReg, tmpReg);
+ // jmp baseReg
+ getEmitter()->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), baseReg);
+}
+
+// emits the table and an instruction to get the address of the first element
+void CodeGen::genJumpTable(GenTree* treeNode)
+{
+ noway_assert(compiler->compCurBB->bbJumpKind == BBJ_SWITCH);
+ assert(treeNode->OperGet() == GT_JMPTABLE);
+
+ unsigned jumpCount = compiler->compCurBB->bbJumpSwt->bbsCount;
+ BasicBlock** jumpTable = compiler->compCurBB->bbJumpSwt->bbsDstTab;
+ unsigned jmpTabOffs;
+ unsigned jmpTabBase;
+
+ jmpTabBase = getEmitter()->emitBBTableDataGenBeg(jumpCount, true);
+
+ jmpTabOffs = 0;
+
+ JITDUMP("\n J_M%03u_DS%02u LABEL DWORD\n", Compiler::s_compMethodsCount, jmpTabBase);
+
+ for (unsigned i = 0; i < jumpCount; i++)
+ {
+ BasicBlock* target = *jumpTable++;
+ noway_assert(target->bbFlags & BBF_JMP_TARGET);
+
+ JITDUMP(" DD L_M%03u_BB%02u\n", Compiler::s_compMethodsCount, target->bbNum);
+
+ getEmitter()->emitDataGenData(i, target);
+    }
+
+ getEmitter()->emitDataGenEnd();
+
+ // Access to inline data is 'abstracted' by a special type of static member
+ // (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference
+ // to constant data, not a real static field.
+ getEmitter()->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), treeNode->gtRegNum,
+ compiler->eeFindJitDataOffs(jmpTabBase), 0);
+ genProduceReg(treeNode);
+}
+
+// generate code for the locked operations:
+// GT_LOCKADD, GT_XCHG, GT_XADD
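+//
+// For example (illustrative), a GT_XADD with the data value already in the target register emits
+//     lock
+//     xadd [addrReg], targetReg       ; targetReg receives the original memory value
+// while a GT_LOCKADD, which produces no register result, emits
+//     lock
+//     add  [addrReg], dataReg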
+void CodeGen::genLockedInstructions(GenTree* treeNode)
+{
+ GenTree* data = treeNode->gtOp.gtOp2;
+ GenTree* addr = treeNode->gtOp.gtOp1;
+ regNumber targetReg = treeNode->gtRegNum;
+ regNumber dataReg = data->gtRegNum;
+ regNumber addrReg = addr->gtRegNum;
+ instruction ins;
+
+ // all of these nodes implicitly do an indirection on op1
+ // so create a temporary node to feed into the pattern matching
+ GenTreeIndir i = indirForm(data->TypeGet(), addr);
+ genConsumeReg(addr);
+
+ // The register allocator should have extended the lifetime of the address
+ // so that it is not used as the target.
+ noway_assert(addrReg != targetReg);
+
+ // If data is a lclVar that's not a last use, we'd better have allocated a register
+ // for the result (except in the case of GT_LOCKADD which does not produce a register result).
+ assert(targetReg != REG_NA || treeNode->OperGet() == GT_LOCKADD || !genIsRegCandidateLocal(data) ||
+ (data->gtFlags & GTF_VAR_DEATH) != 0);
+
+ genConsumeIfReg(data);
+ if (targetReg != REG_NA && dataReg != REG_NA && dataReg != targetReg)
+ {
+ inst_RV_RV(ins_Copy(data->TypeGet()), targetReg, dataReg);
+ data->gtRegNum = targetReg;
+
+ // TODO-XArch-Cleanup: Consider whether it is worth it, for debugging purposes, to restore the
+ // original gtRegNum on data, after calling emitInsBinary below.
+ }
+ switch (treeNode->OperGet())
+ {
+ case GT_LOCKADD:
+ instGen(INS_lock);
+ ins = INS_add;
+ break;
+ case GT_XCHG:
+ // lock is implied by xchg
+ ins = INS_xchg;
+ break;
+ case GT_XADD:
+ instGen(INS_lock);
+ ins = INS_xadd;
+ break;
+ default:
+ unreached();
+ }
+ getEmitter()->emitInsBinary(ins, emitTypeSize(data), &i, data);
+
+ if (treeNode->gtRegNum != REG_NA)
+ {
+ genProduceReg(treeNode);
+ }
+}
+
+// generate code for BoundsCheck nodes
+void CodeGen::genRangeCheck(GenTreePtr oper)
+{
+#ifdef FEATURE_SIMD
+ noway_assert(oper->OperGet() == GT_ARR_BOUNDS_CHECK || oper->OperGet() == GT_SIMD_CHK);
+#else // !FEATURE_SIMD
+ noway_assert(oper->OperGet() == GT_ARR_BOUNDS_CHECK);
+#endif // !FEATURE_SIMD
+
+ GenTreeBoundsChk* bndsChk = oper->AsBoundsChk();
+
+ GenTreePtr arrLen = bndsChk->gtArrLen;
+ GenTreePtr arrIndex = bndsChk->gtIndex;
+ GenTreePtr arrRef = nullptr;
+ int lenOffset = 0;
+
+ GenTree * src1, *src2;
+ emitJumpKind jmpKind;
+
+ genConsumeRegs(arrLen);
+ genConsumeRegs(arrIndex);
+
+ if (arrIndex->isContainedIntOrIImmed())
+ {
+ // arrIndex is a contained constant. In this case
+ // we will generate one of the following
+ // cmp [mem], immed (if arrLen is a memory op)
+ // cmp reg, immed (if arrLen is in a reg)
+ //
+ // That is arrLen cannot be a contained immed.
+ assert(!arrLen->isContainedIntOrIImmed());
+
+ src1 = arrLen;
+ src2 = arrIndex;
+ jmpKind = EJ_jbe;
+ }
+ else
+ {
+ // arrIndex could either be a contained memory op or a reg
+ // In this case we will generate one of the following
+ // cmp [mem], immed (if arrLen is a constant)
+ // cmp [mem], reg (if arrLen is in a reg)
+ // cmp reg, immed (if arrIndex is in a reg)
+        //      cmp reg1, reg2         (if arrIndex is in reg1)
+ // cmp reg, [mem] (if arrLen is a memory op)
+ //
+ // That is only one of arrIndex or arrLen can be a memory op.
+ assert(!arrIndex->isContainedMemoryOp() || !arrLen->isContainedMemoryOp());
+
+ src1 = arrIndex;
+ src2 = arrLen;
+ jmpKind = EJ_jae;
+ }
+
+ var_types bndsChkType = src2->TypeGet();
+#if DEBUG
+ // Bounds checks can only be 32 or 64 bit sized comparisons.
+ assert(bndsChkType == TYP_INT || bndsChkType == TYP_LONG);
+
+    // The type of the bounds check should always be wide enough to compare against the index.
+ assert(emitTypeSize(bndsChkType) >= emitTypeSize(src1->TypeGet()));
+#endif // DEBUG
+
+ getEmitter()->emitInsBinary(INS_cmp, emitTypeSize(bndsChkType), src1, src2);
+ genJumpToThrowHlpBlk(jmpKind, bndsChk->gtThrowKind, bndsChk->gtIndRngFailBB);
+}
+
+//------------------------------------------------------------------------
+// genOffsetOfMDArrayLowerBound: Returns the offset from the Array object to the
+// lower bound for the given dimension.
+//
+// Arguments:
+// elemType - the element type of the array
+// rank - the rank of the array
+// dimension - the dimension for which the lower bound offset will be returned.
+//
+// Return Value:
+// The offset.
+
+unsigned CodeGen::genOffsetOfMDArrayLowerBound(var_types elemType, unsigned rank, unsigned dimension)
+{
+ // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets.
+ return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * (dimension + rank);
+}
+
+//------------------------------------------------------------------------
+// genOffsetOfMDArrayDimensionSize: Returns the offset from the Array object to the
+// size for the given dimension.
+//
+// Arguments:
+// elemType - the element type of the array
+// rank - the rank of the array
+//    dimension - the dimension for which the size will be returned.
+//
+// Return Value:
+// The offset.
+
+unsigned CodeGen::genOffsetOfMDArrayDimensionSize(var_types elemType, unsigned rank, unsigned dimension)
+{
+ // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets.
+ return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * dimension;
+}
+
+//------------------------------------------------------------------------
+// genCodeForArrIndex: Generates code to bounds check the index for one dimension of an array reference,
+// producing the effective index by subtracting the lower bound.
+//
+// Arguments:
+// arrIndex - the node for which we're generating code
+//
+// Return Value:
+// None.
+//
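+// Notes:
+//    As an illustration (offsets shown symbolically), with the index already in tgtReg this emits:
+//        sub tgtReg, [arrReg + genOffsetOfMDArrayLowerBound(...)]    ; effective index = index - lower bound
+//        cmp tgtReg, [arrReg + genOffsetOfMDArrayDimensionSize(...)]
+//        jae <range-check-fail block>
+//    e.g. an index of 5 with a lower bound of 3 yields an effective index of 2, which is then
+//    checked against the dimension's length.
+//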
+
+void CodeGen::genCodeForArrIndex(GenTreeArrIndex* arrIndex)
+{
+ GenTreePtr arrObj = arrIndex->ArrObj();
+ GenTreePtr indexNode = arrIndex->IndexExpr();
+
+ regNumber arrReg = genConsumeReg(arrObj);
+ regNumber indexReg = genConsumeReg(indexNode);
+ regNumber tgtReg = arrIndex->gtRegNum;
+
+ unsigned dim = arrIndex->gtCurrDim;
+ unsigned rank = arrIndex->gtArrRank;
+ var_types elemType = arrIndex->gtArrElemType;
+
+ noway_assert(tgtReg != REG_NA);
+
+ // Subtract the lower bound for this dimension.
+ // TODO-XArch-CQ: make this contained if it's an immediate that fits.
+ if (tgtReg != indexReg)
+ {
+ inst_RV_RV(INS_mov, tgtReg, indexReg, indexNode->TypeGet());
+ }
+ getEmitter()->emitIns_R_AR(INS_sub, emitActualTypeSize(TYP_INT), tgtReg, arrReg,
+ genOffsetOfMDArrayLowerBound(elemType, rank, dim));
+ getEmitter()->emitIns_R_AR(INS_cmp, emitActualTypeSize(TYP_INT), tgtReg, arrReg,
+ genOffsetOfMDArrayDimensionSize(elemType, rank, dim));
+ genJumpToThrowHlpBlk(EJ_jae, SCK_RNGCHK_FAIL);
+
+ genProduceReg(arrIndex);
+}
+
+//------------------------------------------------------------------------
+// genCodeForArrOffset: Generates code to compute the flattened array offset for
+// one dimension of an array reference:
+// result = (prevDimOffset * dimSize) + effectiveIndex
+// where dimSize is obtained from the arrObj operand
+//
+// Arguments:
+// arrOffset - the node for which we're generating code
+//
+// Return Value:
+// None.
+//
+// Notes:
+// dimSize and effectiveIndex are always non-negative, the former by design,
+// and the latter because it has been normalized to be zero-based.
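+//    As a worked example: if prevDimOffset is 3, this dimension's size is 10, and the effective
+//    index is 4, the node produces 3 * 10 + 4 = 34 as the flattened offset so far.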
+
+void CodeGen::genCodeForArrOffset(GenTreeArrOffs* arrOffset)
+{
+ GenTreePtr offsetNode = arrOffset->gtOffset;
+ GenTreePtr indexNode = arrOffset->gtIndex;
+ GenTreePtr arrObj = arrOffset->gtArrObj;
+
+ regNumber tgtReg = arrOffset->gtRegNum;
+
+ noway_assert(tgtReg != REG_NA);
+
+ unsigned dim = arrOffset->gtCurrDim;
+ unsigned rank = arrOffset->gtArrRank;
+ var_types elemType = arrOffset->gtArrElemType;
+
+ // We will use a temp register for the offset*scale+effectiveIndex computation.
+ regMaskTP tmpRegMask = arrOffset->gtRsvdRegs;
+ regNumber tmpReg = genRegNumFromMask(tmpRegMask);
+
+ // First, consume the operands in the correct order.
+ regNumber offsetReg = REG_NA;
+ if (!offsetNode->IsIntegralConst(0))
+ {
+ offsetReg = genConsumeReg(offsetNode);
+ }
+ else
+ {
+ assert(offsetNode->isContained());
+ }
+ regNumber indexReg = genConsumeReg(indexNode);
+ // Although arrReg may not be used in the constant-index case, if we have generated
+ // the value into a register, we must consume it, otherwise we will fail to end the
+ // live range of the gc ptr.
+ // TODO-CQ: Currently arrObj will always have a register allocated to it.
+ // We could avoid allocating a register for it, which would be of value if the arrObj
+ // is an on-stack lclVar.
+ regNumber arrReg = REG_NA;
+ if (arrObj->gtHasReg())
+ {
+ arrReg = genConsumeReg(arrObj);
+ }
+
+ if (!offsetNode->IsIntegralConst(0))
+ {
+ // Evaluate tgtReg = offsetReg*dim_size + indexReg.
+ // tmpReg is used to load dim_size and the result of the multiplication.
+ // Note that dim_size will never be negative.
+
+ getEmitter()->emitIns_R_AR(INS_mov, emitActualTypeSize(TYP_INT), tmpReg, arrReg,
+ genOffsetOfMDArrayDimensionSize(elemType, rank, dim));
+ inst_RV_RV(INS_imul, tmpReg, offsetReg);
+
+ if (tmpReg == tgtReg)
+ {
+ inst_RV_RV(INS_add, tmpReg, indexReg);
+ }
+ else
+ {
+ if (indexReg != tgtReg)
+ {
+ inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_I_IMPL);
+ }
+ inst_RV_RV(INS_add, tgtReg, tmpReg);
+ }
+ }
+ else
+ {
+ if (indexReg != tgtReg)
+ {
+ inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_INT);
+ }
+ }
+ genProduceReg(arrOffset);
+}
+
+// make a temporary indir we can feed to pattern matching routines
+// in cases where we don't want to instantiate all the indirs that would otherwise be needed
+//
+GenTreeIndir CodeGen::indirForm(var_types type, GenTree* base)
+{
+ GenTreeIndir i(GT_IND, type, base, nullptr);
+ i.gtRegNum = REG_NA;
+ // has to be nonnull (because contained nodes can't be the last in block)
+ // but don't want it to be a valid pointer
+ i.gtNext = (GenTree*)(-1);
+ return i;
+}
+
+// make a temporary int we can feed to pattern matching routines
+// in cases where we don't want to instantiate an actual int constant node
+//
+GenTreeIntCon CodeGen::intForm(var_types type, ssize_t value)
+{
+ GenTreeIntCon i(type, value);
+ i.gtRegNum = REG_NA;
+ // has to be nonnull (because contained nodes can't be the last in block)
+ // but don't want it to be a valid pointer
+ i.gtNext = (GenTree*)(-1);
+ return i;
+}
+
+instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type)
+{
+ instruction ins;
+
+    // Operations on SIMD vectors shouldn't come down this path
+ assert(!varTypeIsSIMD(type));
+ if (varTypeIsFloating(type))
+ {
+ return ins_MathOp(oper, type);
+ }
+
+ switch (oper)
+ {
+ case GT_ADD:
+ ins = INS_add;
+ break;
+ case GT_AND:
+ ins = INS_and;
+ break;
+ case GT_LSH:
+ ins = INS_shl;
+ break;
+ case GT_MUL:
+ ins = INS_imul;
+ break;
+ case GT_NEG:
+ ins = INS_neg;
+ break;
+ case GT_NOT:
+ ins = INS_not;
+ break;
+ case GT_OR:
+ ins = INS_or;
+ break;
+ case GT_ROL:
+ ins = INS_rol;
+ break;
+ case GT_ROR:
+ ins = INS_ror;
+ break;
+ case GT_RSH:
+ ins = INS_sar;
+ break;
+ case GT_RSZ:
+ ins = INS_shr;
+ break;
+ case GT_SUB:
+ ins = INS_sub;
+ break;
+ case GT_XOR:
+ ins = INS_xor;
+ break;
+#if !defined(_TARGET_64BIT_)
+ case GT_ADD_LO:
+ ins = INS_add;
+ break;
+ case GT_ADD_HI:
+ ins = INS_adc;
+ break;
+ case GT_SUB_LO:
+ ins = INS_sub;
+ break;
+ case GT_SUB_HI:
+ ins = INS_sbb;
+ break;
+#endif // !defined(_TARGET_64BIT_)
+ default:
+ unreached();
+ break;
+ }
+ return ins;
+}
+
+//------------------------------------------------------------------------
+// genCodeForShift: Generates the code sequence for a GenTree node that
+// represents a bit shift or rotate operation (<<, >>, >>>, rol, ror).
+//
+// Arguments:
+// tree - the bit shift node (that specifies the type of bit shift to perform).
+//
+// Assumptions:
+// a) All GenTrees are register allocated.
+// b) The shift-by-amount in tree->gtOp.gtOp2 is either a contained constant or
+// it's a register-allocated expression. If it is in a register that is
+// not RCX, it will be moved to RCX (so RCX better not be in use!).
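+//
+// As a sketch of the two cases handled below (registers are illustrative):
+//      ; shift by a contained constant
+//      mov  dstReg, operandReg          ; only if the two registers differ
+//      shl  dstReg, 5
+//      ; shift by a variable amount
+//      mov  ecx, shiftReg               ; only if the count is not already in RCX
+//      mov  dstReg, operandReg
+//      shl  dstReg, cl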
+//
+void CodeGen::genCodeForShift(GenTreePtr tree)
+{
+ // Only the non-RMW case here.
+ assert(tree->OperIsShiftOrRotate());
+ assert(!tree->gtOp.gtOp1->isContained());
+ assert(tree->gtRegNum != REG_NA);
+
+ genConsumeOperands(tree->AsOp());
+
+ var_types targetType = tree->TypeGet();
+ instruction ins = genGetInsForOper(tree->OperGet(), targetType);
+
+ GenTreePtr operand = tree->gtGetOp1();
+ regNumber operandReg = operand->gtRegNum;
+
+ GenTreePtr shiftBy = tree->gtGetOp2();
+ if (shiftBy->isContainedIntOrIImmed())
+ {
+ // First, move the operand to the destination register and
+ // later on perform the shift in-place.
+ // (LSRA will try to avoid this situation through preferencing.)
+ if (tree->gtRegNum != operandReg)
+ {
+ inst_RV_RV(INS_mov, tree->gtRegNum, operandReg, targetType);
+ }
+
+ int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
+ inst_RV_SH(ins, emitTypeSize(tree), tree->gtRegNum, shiftByValue);
+ }
+ else
+ {
+ // We must have the number of bits to shift stored in ECX, since we constrained this node to
+ // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single
+ // register destination requirement.
+ regNumber shiftReg = shiftBy->gtRegNum;
+ if (shiftReg != REG_RCX)
+ {
+ // Issue the mov to RCX:
+ inst_RV_RV(INS_mov, REG_RCX, shiftReg, shiftBy->TypeGet());
+ }
+
+ // The operand to be shifted must not be in ECX
+ noway_assert(operandReg != REG_RCX);
+
+ if (tree->gtRegNum != operandReg)
+ {
+ inst_RV_RV(INS_mov, tree->gtRegNum, operandReg, targetType);
+ }
+ inst_RV_CL(ins, tree->gtRegNum, targetType);
+ }
+
+ genProduceReg(tree);
+}
+
+//------------------------------------------------------------------------
+// genCodeForShiftRMW: Generates the code sequence for a GT_STOREIND GenTree node that
+// represents an RMW bit shift or rotate operation (<<, >>, >>>, rol, ror), for example:
+// GT_STOREIND( AddressTree, GT_SHL( Ind ( AddressTree ), Operand ) )
+//
+// Arguments:
+// storeIndNode: the GT_STOREIND node.
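+//
+// For example (illustrative), for the tree GT_STOREIND(addr, GT_SHL(GT_IND(addr), 3)) this emits a
+// single read-modify-write instruction:
+//     shl dword ptr [addr], 3
+// A shift-by-one instead uses the one-bit opcode form, which takes no immediate operand.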
+//
+void CodeGen::genCodeForShiftRMW(GenTreeStoreInd* storeInd)
+{
+ GenTree* data = storeInd->Data();
+ GenTree* addr = storeInd->Addr();
+
+ assert(data->OperIsShiftOrRotate());
+
+ // This function only handles the RMW case.
+ assert(data->gtOp.gtOp1->isContained());
+ assert(data->gtOp.gtOp1->isIndir());
+ assert(Lowering::IndirsAreEquivalent(data->gtOp.gtOp1, storeInd));
+ assert(data->gtRegNum == REG_NA);
+
+ var_types targetType = data->TypeGet();
+ genTreeOps oper = data->OperGet();
+ instruction ins = genGetInsForOper(oper, targetType);
+ emitAttr attr = EA_ATTR(genTypeSize(targetType));
+
+ GenTree* shiftBy = data->gtOp.gtOp2;
+ if (shiftBy->isContainedIntOrIImmed())
+ {
+ int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
+ ins = genMapShiftInsToShiftByConstantIns(ins, shiftByValue);
+ if (shiftByValue == 1)
+ {
+ // There is no source in this case, as the shift by count is embedded in the instruction opcode itself.
+ getEmitter()->emitInsRMW(ins, attr, storeInd);
+ }
+ else
+ {
+ getEmitter()->emitInsRMW(ins, attr, storeInd, shiftBy);
+ }
+ }
+ else
+ {
+ // We must have the number of bits to shift stored in ECX, since we constrained this node to
+ // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single
+ // register destination requirement.
+ regNumber shiftReg = shiftBy->gtRegNum;
+ if (shiftReg != REG_RCX)
+ {
+ // Issue the mov to RCX:
+ inst_RV_RV(INS_mov, REG_RCX, shiftReg, shiftBy->TypeGet());
+ }
+
+ // The shiftBy operand is implicit, so call the unary version of emitInsRMW.
+ getEmitter()->emitInsRMW(ins, attr, storeInd);
+ }
+}
+
+void CodeGen::genUnspillRegIfNeeded(GenTree* tree)
+{
+ regNumber dstReg = tree->gtRegNum;
+ GenTree* unspillTree = tree;
+
+ if (tree->gtOper == GT_RELOAD)
+ {
+ unspillTree = tree->gtOp.gtOp1;
+ }
+
+ if ((unspillTree->gtFlags & GTF_SPILLED) != 0)
+ {
+ if (genIsRegCandidateLocal(unspillTree))
+ {
+ // Reset spilled flag, since we are going to load a local variable from its home location.
+ unspillTree->gtFlags &= ~GTF_SPILLED;
+
+ GenTreeLclVarCommon* lcl = unspillTree->AsLclVarCommon();
+ LclVarDsc* varDsc = &compiler->lvaTable[lcl->gtLclNum];
+
+ // Load local variable from its home location.
+ // In most cases the tree type will indicate the correct type to use for the load.
+ // However, if it is NOT a normalizeOnLoad lclVar (i.e. NOT a small int that always gets
+ // widened when loaded into a register), and its size is not the same as genActualType of
+ // the type of the lclVar, then we need to change the type of the tree node when loading.
+ // This situation happens due to "optimizations" that avoid a cast and
+ // simply retype the node when using long type lclVar as an int.
+ // While loading the int in that case would work for this use of the lclVar, if it is
+ // later used as a long, we will have incorrectly truncated the long.
+ // In the normalizeOnLoad case ins_Load will return an appropriate sign- or zero-
+ // extending load.
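+            // For example (hypothetical), a TYP_LONG lclVar that has been retyped to TYP_INT at this
+            // use is reloaded with an 8-byte load (its actual type) below; the node's type is only
+            // changed around the load so that a narrower, truncating load is never emitted.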
+
+ var_types treeType = unspillTree->TypeGet();
+ if (treeType != genActualType(varDsc->lvType) && !varTypeIsGC(treeType) && !varDsc->lvNormalizeOnLoad())
+ {
+ assert(!varTypeIsGC(varDsc));
+ var_types spillType = genActualType(varDsc->lvType);
+ unspillTree->gtType = spillType;
+ inst_RV_TT(ins_Load(spillType, compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)), dstReg, unspillTree);
+ unspillTree->gtType = treeType;
+ }
+ else
+ {
+ inst_RV_TT(ins_Load(treeType, compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)), dstReg, unspillTree);
+ }
+
+ unspillTree->SetInReg();
+
+ // TODO-Review: We would like to call:
+ // genUpdateRegLife(varDsc, /*isBorn*/ true, /*isDying*/ false DEBUGARG(tree));
+ // instead of the following code, but this ends up hitting this assert:
+ // assert((regSet.rsMaskVars & regMask) == 0);
+ // due to issues with LSRA resolution moves.
+ // So, just force it for now. This probably indicates a condition that creates a GC hole!
+ //
+ // Extra note: I think we really want to call something like gcInfo.gcUpdateForRegVarMove,
+ // because the variable is not really going live or dead, but that method is somewhat poorly
+ // factored because it, in turn, updates rsMaskVars which is part of RegSet not GCInfo.
+ // TODO-Cleanup: This code exists in other CodeGen*.cpp files, and should be moved to CodeGenCommon.cpp.
+
+ // Don't update the variable's location if we are just re-spilling it again.
+
+ if ((unspillTree->gtFlags & GTF_SPILL) == 0)
+ {
+ genUpdateVarReg(varDsc, tree);
+#ifdef DEBUG
+ if (VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex))
+ {
+ JITDUMP("\t\t\t\t\t\t\tRemoving V%02u from gcVarPtrSetCur\n", lcl->gtLclNum);
+ }
+#endif // DEBUG
+ VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
+
+#ifdef DEBUG
+ if (compiler->verbose)
+ {
+ printf("\t\t\t\t\t\t\tV%02u in reg ", lcl->gtLclNum);
+ varDsc->PrintVarReg();
+ printf(" is becoming live ");
+ compiler->printTreeID(unspillTree);
+ printf("\n");
+ }
+#endif // DEBUG
+
+ regSet.AddMaskVars(genGetRegMask(varDsc));
+ }
+
+ gcInfo.gcMarkRegPtrVal(dstReg, unspillTree->TypeGet());
+ }
+ else if (unspillTree->IsMultiRegCall())
+ {
+ GenTreeCall* call = unspillTree->AsCall();
+ ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+ unsigned regCount = retTypeDesc->GetReturnRegCount();
+ GenTreeCopyOrReload* reloadTree = nullptr;
+ if (tree->OperGet() == GT_RELOAD)
+ {
+ reloadTree = tree->AsCopyOrReload();
+ }
+
+ // In case of multi-reg call node, GTF_SPILLED flag on it indicates that
+ // one or more of its result regs are spilled. Call node needs to be
+ // queried to know which specific result regs to be unspilled.
+ for (unsigned i = 0; i < regCount; ++i)
+ {
+ unsigned flags = call->GetRegSpillFlagByIdx(i);
+ if ((flags & GTF_SPILLED) != 0)
+ {
+ var_types dstType = retTypeDesc->GetReturnRegType(i);
+ regNumber unspillTreeReg = call->GetRegNumByIdx(i);
+
+ if (reloadTree != nullptr)
+ {
+ dstReg = reloadTree->GetRegNumByIdx(i);
+ if (dstReg == REG_NA)
+ {
+ dstReg = unspillTreeReg;
+ }
+ }
+ else
+ {
+ dstReg = unspillTreeReg;
+ }
+
+ TempDsc* t = regSet.rsUnspillInPlace(call, unspillTreeReg, i);
+ getEmitter()->emitIns_R_S(ins_Load(dstType), emitActualTypeSize(dstType), dstReg, t->tdTempNum(),
+ 0);
+ compiler->tmpRlsTemp(t);
+ gcInfo.gcMarkRegPtrVal(dstReg, dstType);
+ }
+ }
+
+ unspillTree->gtFlags &= ~GTF_SPILLED;
+ unspillTree->SetInReg();
+ }
+ else
+ {
+ TempDsc* t = regSet.rsUnspillInPlace(unspillTree, unspillTree->gtRegNum);
+ getEmitter()->emitIns_R_S(ins_Load(unspillTree->gtType), emitActualTypeSize(unspillTree->TypeGet()), dstReg,
+ t->tdTempNum(), 0);
+ compiler->tmpRlsTemp(t);
+
+ unspillTree->gtFlags &= ~GTF_SPILLED;
+ unspillTree->SetInReg();
+ gcInfo.gcMarkRegPtrVal(dstReg, unspillTree->TypeGet());
+ }
+ }
+}
+
+// Do the liveness update for a subnode that is being consumed by codegen,
+// including the logic for reload if one is needed, and also take care
+// of moving the value into the desired register.
+void CodeGen::genConsumeRegAndCopy(GenTree* tree, regNumber needReg)
+{
+ if (needReg == REG_NA)
+ {
+ return;
+ }
+ regNumber treeReg = genConsumeReg(tree);
+ if (treeReg != needReg)
+ {
+ inst_RV_RV(INS_mov, needReg, treeReg, tree->TypeGet());
+ }
+}
+
+void CodeGen::genRegCopy(GenTree* treeNode)
+{
+ assert(treeNode->OperGet() == GT_COPY);
+ GenTree* op1 = treeNode->gtOp.gtOp1;
+
+ if (op1->IsMultiRegCall())
+ {
+ genConsumeReg(op1);
+
+ GenTreeCopyOrReload* copyTree = treeNode->AsCopyOrReload();
+ GenTreeCall* call = op1->AsCall();
+ ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+ unsigned regCount = retTypeDesc->GetReturnRegCount();
+
+ for (unsigned i = 0; i < regCount; ++i)
+ {
+ var_types type = retTypeDesc->GetReturnRegType(i);
+ regNumber fromReg = call->GetRegNumByIdx(i);
+ regNumber toReg = copyTree->GetRegNumByIdx(i);
+
+            // A multi-reg GT_COPY node will have a valid reg only for those
+            // positions whose corresponding result reg of the call node needs
+            // to be copied.
+ if (toReg != REG_NA)
+ {
+ assert(toReg != fromReg);
+ inst_RV_RV(ins_Copy(type), toReg, fromReg, type);
+ }
+ }
+ }
+ else
+ {
+ var_types targetType = treeNode->TypeGet();
+ regNumber targetReg = treeNode->gtRegNum;
+ assert(targetReg != REG_NA);
+
+ // Check whether this node and the node from which we're copying the value have
+ // different register types. This can happen if (currently iff) we have a SIMD
+ // vector type that fits in an integer register, in which case it is passed as
+ // an argument, or returned from a call, in an integer register and must be
+ // copied if it's in an xmm register.
+
+ bool srcFltReg = (varTypeIsFloating(op1) || varTypeIsSIMD(op1));
+ bool tgtFltReg = (varTypeIsFloating(treeNode) || varTypeIsSIMD(treeNode));
+ if (srcFltReg != tgtFltReg)
+ {
+ instruction ins;
+ regNumber fpReg;
+ regNumber intReg;
+ if (tgtFltReg)
+ {
+ ins = ins_CopyIntToFloat(op1->TypeGet(), treeNode->TypeGet());
+ fpReg = targetReg;
+ intReg = op1->gtRegNum;
+ }
+ else
+ {
+ ins = ins_CopyFloatToInt(op1->TypeGet(), treeNode->TypeGet());
+ intReg = targetReg;
+ fpReg = op1->gtRegNum;
+ }
+ inst_RV_RV(ins, fpReg, intReg, targetType);
+ }
+ else
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, genConsumeReg(op1), targetType);
+ }
+
+ if (op1->IsLocal())
+ {
+ // The lclVar will never be a def.
+ // If it is a last use, the lclVar will be killed by genConsumeReg(), as usual, and genProduceReg will
+ // appropriately set the gcInfo for the copied value.
+ // If not, there are two cases we need to handle:
+ // - If this is a TEMPORARY copy (indicated by the GTF_VAR_DEATH flag) the variable
+ // will remain live in its original register.
+ // genProduceReg() will appropriately set the gcInfo for the copied value,
+ // and genConsumeReg will reset it.
+ // - Otherwise, we need to update register info for the lclVar.
+
+ GenTreeLclVarCommon* lcl = op1->AsLclVarCommon();
+ assert((lcl->gtFlags & GTF_VAR_DEF) == 0);
+
+ if ((lcl->gtFlags & GTF_VAR_DEATH) == 0 && (treeNode->gtFlags & GTF_VAR_DEATH) == 0)
+ {
+ LclVarDsc* varDsc = &compiler->lvaTable[lcl->gtLclNum];
+
+ // If we didn't just spill it (in genConsumeReg, above), then update the register info
+ if (varDsc->lvRegNum != REG_STK)
+ {
+ // The old location is dying
+ genUpdateRegLife(varDsc, /*isBorn*/ false, /*isDying*/ true DEBUGARG(op1));
+
+ gcInfo.gcMarkRegSetNpt(genRegMask(op1->gtRegNum));
+
+ genUpdateVarReg(varDsc, treeNode);
+
+ // The new location is going live
+ genUpdateRegLife(varDsc, /*isBorn*/ true, /*isDying*/ false DEBUGARG(treeNode));
+ }
+ }
+ }
+ }
+
+ genProduceReg(treeNode);
+}
+
+// Check that registers are consumed in the right order for the current node being generated.
+#ifdef DEBUG
+void CodeGen::genCheckConsumeNode(GenTree* treeNode)
+{
+ // GT_PUTARG_REG is consumed out of order.
+ if (treeNode->gtSeqNum != 0 && treeNode->OperGet() != GT_PUTARG_REG)
+ {
+ if (lastConsumedNode != nullptr)
+ {
+ if (treeNode == lastConsumedNode)
+ {
+ if (verbose)
+ {
+ printf("Node was consumed twice:\n ");
+ compiler->gtDispTree(treeNode, nullptr, nullptr, true);
+ }
+ }
+ else
+ {
+ if (verbose && (lastConsumedNode->gtSeqNum > treeNode->gtSeqNum))
+ {
+ printf("Nodes were consumed out-of-order:\n");
+ compiler->gtDispTree(lastConsumedNode, nullptr, nullptr, true);
+ compiler->gtDispTree(treeNode, nullptr, nullptr, true);
+ }
+ // assert(lastConsumedNode->gtSeqNum < treeNode->gtSeqNum);
+ }
+ }
+ lastConsumedNode = treeNode;
+ }
+}
+#endif // DEBUG
+
+//--------------------------------------------------------------------
+// genConsumeReg: Do liveness update for a subnode that is being
+// consumed by codegen.
+//
+// Arguments:
+// tree - GenTree node
+//
+// Return Value:
+// Returns the reg number of tree.
+// In case of multi-reg call node returns the first reg number
+// of the multi-reg return.
+regNumber CodeGen::genConsumeReg(GenTree* tree)
+{
+ if (tree->OperGet() == GT_COPY)
+ {
+ genRegCopy(tree);
+ }
+
+ // Handle the case where we have a lclVar that needs to be copied before use (i.e. because it
+ // interferes with one of the other sources (or the target, if it's a "delayed use" register)).
+ // TODO-Cleanup: This is a special copyReg case in LSRA - consider eliminating these and
+ // always using GT_COPY to make the lclVar location explicit.
+ // Note that we have to do this before calling genUpdateLife because otherwise if we spill it
+ // the lvRegNum will be set to REG_STK and we will lose track of what register currently holds
+ // the lclVar (normally when a lclVar is spilled it is then used from its former register
+ // location, which matches the gtRegNum on the node).
+ // (Note that it doesn't matter if we call this before or after genUnspillRegIfNeeded
+ // because if it's on the stack it will always get reloaded into tree->gtRegNum).
+ if (genIsRegCandidateLocal(tree))
+ {
+ GenTreeLclVarCommon* lcl = tree->AsLclVarCommon();
+ LclVarDsc* varDsc = &compiler->lvaTable[lcl->GetLclNum()];
+ if (varDsc->lvRegNum != REG_STK && varDsc->lvRegNum != tree->gtRegNum)
+ {
+ inst_RV_RV(INS_mov, tree->gtRegNum, varDsc->lvRegNum);
+ }
+ }
+
+ genUnspillRegIfNeeded(tree);
+
+ // genUpdateLife() will also spill local var if marked as GTF_SPILL by calling CodeGen::genSpillVar
+ genUpdateLife(tree);
+
+ assert(tree->gtHasReg());
+
+ // there are three cases where consuming a reg means clearing the bit in the live mask
+ // 1. it was not produced by a local
+ // 2. it was produced by a local that is going dead
+ // 3. it was produced by a local that does not live in that reg (like one allocated on the stack)
+
+ if (genIsRegCandidateLocal(tree))
+ {
+ GenTreeLclVarCommon* lcl = tree->AsLclVarCommon();
+ LclVarDsc* varDsc = &compiler->lvaTable[lcl->GetLclNum()];
+ assert(varDsc->lvLRACandidate);
+
+ if ((tree->gtFlags & GTF_VAR_DEATH) != 0)
+ {
+ gcInfo.gcMarkRegSetNpt(genRegMask(varDsc->lvRegNum));
+ }
+ else if (varDsc->lvRegNum == REG_STK)
+ {
+ // We have loaded this into a register only temporarily
+ gcInfo.gcMarkRegSetNpt(genRegMask(tree->gtRegNum));
+ }
+ }
+ else
+ {
+ gcInfo.gcMarkRegSetNpt(tree->gtGetRegMask());
+ }
+
+ genCheckConsumeNode(tree);
+ return tree->gtRegNum;
+}
+
+// Do liveness update for an address tree: one of GT_LEA, GT_LCL_VAR, or GT_CNS_INT (for call indirect).
+void CodeGen::genConsumeAddress(GenTree* addr)
+{
+ if (!addr->isContained())
+ {
+ genConsumeReg(addr);
+ }
+ else if (addr->OperGet() == GT_LEA)
+ {
+ genConsumeAddrMode(addr->AsAddrMode());
+ }
+}
+
+// do liveness update for a subnode that is being consumed by codegen
+void CodeGen::genConsumeAddrMode(GenTreeAddrMode* addr)
+{
+ genConsumeOperands(addr);
+}
+
+void CodeGen::genConsumeRegs(GenTree* tree)
+{
+#if !defined(_TARGET_64BIT_)
+ if (tree->OperGet() == GT_LONG)
+ {
+ genConsumeRegs(tree->gtGetOp1());
+ genConsumeRegs(tree->gtGetOp2());
+ return;
+ }
+#endif // !defined(_TARGET_64BIT_)
+
+ if (tree->isContained())
+ {
+ if (tree->isContainedSpillTemp())
+ {
+ // spill temps are un-tracked and hence no need to update life
+ }
+ else if (tree->isIndir())
+ {
+ genConsumeAddress(tree->AsIndir()->Addr());
+ }
+ else if (tree->OperGet() == GT_AND)
+ {
+ // This is the special contained GT_AND that we created in Lowering::LowerCmp()
+ // Now we need to consume the operands of the GT_AND node.
+ genConsumeOperands(tree->AsOp());
+ }
+ else if (tree->OperGet() == GT_LCL_VAR)
+ {
+ // A contained lcl var must be living on stack and marked as reg optional.
+ unsigned varNum = tree->AsLclVarCommon()->GetLclNum();
+ LclVarDsc* varDsc = compiler->lvaTable + varNum;
+
+ noway_assert(varDsc->lvRegNum == REG_STK);
+ noway_assert(tree->IsRegOptional());
+
+ // Update the life of reg optional lcl var.
+ genUpdateLife(tree);
+ }
+ else
+ {
+ assert(tree->OperIsLeaf());
+ }
+ }
+ else
+ {
+ genConsumeReg(tree);
+ }
+}
+
+//------------------------------------------------------------------------
+// genConsumeOperands: Do liveness update for the operands of a unary or binary tree
+//
+// Arguments:
+// tree - the GenTreeOp whose operands will have their liveness updated.
+//
+// Return Value:
+// None.
+//
+// Notes:
+// Note that this logic is localized here because we must do the liveness update in
+// the correct execution order. This is important because we may have two operands
+// that involve the same lclVar, and if one is marked "lastUse" we must handle it
+// after the first.
+
+void CodeGen::genConsumeOperands(GenTreeOp* tree)
+{
+ GenTree* firstOp = tree->gtOp1;
+ GenTree* secondOp = tree->gtOp2;
+ if ((tree->gtFlags & GTF_REVERSE_OPS) != 0)
+ {
+ assert(secondOp != nullptr);
+ firstOp = secondOp;
+ secondOp = tree->gtOp1;
+ }
+ if (firstOp != nullptr)
+ {
+ genConsumeRegs(firstOp);
+ }
+ if (secondOp != nullptr)
+ {
+ genConsumeRegs(secondOp);
+ }
+}
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+//------------------------------------------------------------------------
+// genConsumePutStructArgStk: Do liveness update for the operands of a PutArgStk node.
+// Also loads in the right register the addresses of the
+// src/dst for rep mov operation.
+//
+// Arguments:
+// putArgNode - the PUTARG_STK tree.
+// dstReg - the dstReg for the rep move operation.
+// srcReg - the srcReg for the rep move operation.
+// sizeReg - the sizeReg for the rep move operation.
+// baseVarNum - the varnum for the local used for placing the "by-value" args on the stack.
+//
+// Return Value:
+// None.
+//
+// Note: sizeReg can be REG_NA when this function is used to consume the dstReg and srcReg
+//       for copying a struct with references onto the stack.
+// The source address/offset is determined from the address on the GT_OBJ node, while
+// the destination address is the address contained in 'baseVarNum' plus the offset
+// provided in the 'putArgNode'.
+
+void CodeGen::genConsumePutStructArgStk(
+ GenTreePutArgStk* putArgNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg, unsigned baseVarNum)
+{
+ assert(varTypeIsStruct(putArgNode));
+ assert(baseVarNum != BAD_VAR_NUM);
+
+ // The putArgNode children are always contained. We should not consume any registers.
+ assert(putArgNode->gtGetOp1()->isContained());
+
+ GenTree* dstAddr = putArgNode;
+
+ // Get the source address.
+ GenTree* src = putArgNode->gtGetOp1();
+ assert((src->gtOper == GT_OBJ) || ((src->gtOper == GT_IND && varTypeIsSIMD(src))));
+ GenTree* srcAddr = src->gtGetOp1();
+
+ size_t size = putArgNode->getArgSize();
+
+ assert(dstReg != REG_NA);
+ assert(srcReg != REG_NA);
+
+ // Consume the registers only if they are not contained or set to REG_NA.
+ if (srcAddr->gtRegNum != REG_NA)
+ {
+ genConsumeReg(srcAddr);
+ }
+
+ // If the op1 is already in the dstReg - nothing to do.
+ // Otherwise load the op1 (GT_ADDR) into the dstReg to copy the struct on the stack by value.
+ if (dstAddr->gtRegNum != dstReg)
+ {
+        // Generate an LEA instruction to load the address of the outgoing arg slot (base var + arg offset,
+        // or the incoming arg area for tail calls) into the dstReg.
+ // Destination is always local (on the stack) - use EA_PTRSIZE.
+ getEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, dstReg, baseVarNum, putArgNode->getArgOffset());
+ }
+
+ if (srcAddr->gtRegNum != srcReg)
+ {
+ if (srcAddr->OperIsLocalAddr())
+ {
+ // The OperLocalAddr is always contained.
+ assert(srcAddr->isContained());
+ GenTreeLclVarCommon* lclNode = srcAddr->AsLclVarCommon();
+
+ // Generate LEA instruction to load the LclVar address in RSI.
+ // Source is known to be on the stack. Use EA_PTRSIZE.
+ unsigned int offset = 0;
+ if (srcAddr->OperGet() == GT_LCL_FLD_ADDR)
+ {
+ offset = srcAddr->AsLclFld()->gtLclOffs;
+ }
+ getEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, srcReg, lclNode->gtLclNum, offset);
+ }
+ else
+ {
+ assert(srcAddr->gtRegNum != REG_NA);
+ // Source is not known to be on the stack. Use EA_BYREF.
+ getEmitter()->emitIns_R_R(INS_mov, EA_BYREF, srcReg, srcAddr->gtRegNum);
+ }
+ }
+
+ if (sizeReg != REG_NA)
+ {
+ inst_RV_IV(INS_mov, sizeReg, size, EA_8BYTE);
+ }
+}
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+//------------------------------------------------------------------------
+// genConsumeBlockSize: Ensure that the block size is in the given register
+//
+// Arguments:
+// blkNode - The block node
+// sizeReg - The register into which the block's size should go
+//
+
+void CodeGen::genConsumeBlockSize(GenTreeBlk* blkNode, regNumber sizeReg)
+{
+ if (sizeReg != REG_NA)
+ {
+ unsigned blockSize = blkNode->Size();
+ if (blockSize != 0)
+ {
+ assert(blkNode->gtRsvdRegs == genRegMask(sizeReg));
+ genSetRegToIcon(sizeReg, blockSize);
+ }
+ else
+ {
+ noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK);
+ genConsumeReg(blkNode->AsDynBlk()->gtDynamicSize);
+ }
+ }
+}
+
+//------------------------------------------------------------------------
+// genConsumeBlockDst: Ensure that the block destination address is in its
+// allocated register.
+// Arguments:
+// blkNode - The block node
+//
+
+void CodeGen::genConsumeBlockDst(GenTreeBlk* blkNode)
+{
+ GenTree* dstAddr = blkNode->Addr();
+ genConsumeReg(dstAddr);
+}
+
+//------------------------------------------------------------------------
+// genConsumeBlockSrc: Ensure that the block source address is in its
+// allocated register if it is non-local.
+// Arguments:
+// blkNode - The block node
+//
+// Return Value:
+// Returns the source address node, if it is non-local,
+// and nullptr otherwise.
+
+GenTree* CodeGen::genConsumeBlockSrc(GenTreeBlk* blkNode)
+{
+ GenTree* src = blkNode->Data();
+ if (blkNode->OperIsCopyBlkOp())
+ {
+ // For a CopyBlk we need the address of the source.
+ if (src->OperGet() == GT_IND)
+ {
+ src = src->gtOp.gtOp1;
+ }
+ else
+ {
+ // This must be a local.
+ // For this case, there is no source address register, as it is a
+ // stack-based address.
+ assert(src->OperIsLocal());
+ return nullptr;
+ }
+ }
+ genConsumeReg(src);
+ return src;
+}
+
+//------------------------------------------------------------------------
+// genConsumeBlockOp: Ensure that the block's operands are enregistered
+// as needed.
+// Arguments:
+// blkNode - The block node
+//
+// Notes:
+// This ensures that the operands are consumed in the proper order to
+// obey liveness modeling.
+
+void CodeGen::genConsumeBlockOp(GenTreeBlk* blkNode, regNumber dstReg, regNumber srcReg, regNumber sizeReg)
+{
+ // We have to consume the registers, and perform any copies, in the actual execution order.
+ // The nominal order is: dst, src, size. However this may have been changed
+ // with reverse flags on the blkNode and the setting of gtEvalSizeFirst in the case of a dynamic
+ // block size.
+ // Note that the register allocator ensures that the registers ON THE NODES will not interfere
+ // with one another if consumed (i.e. reloaded or moved to their ASSIGNED reg) in execution order.
+ // Further, it ensures that they will not interfere with one another if they are then copied
+ // to the REQUIRED register (if a fixed register requirement) in execution order. This requires,
+ // then, that we first consume all the operands, then do any necessary moves.
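+    // For example (illustrative): for a GT_STORE_DYN_BLK with gtEvalSizeFirst set, the size operand
+    // is consumed first, then the destination and source (in reverse order if IsReverseOp()), and
+    // only after all three are consumed are any of them moved into the required dstReg/srcReg/sizeReg.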
+
+ GenTree* dstAddr = blkNode->Addr();
+ GenTree* src = nullptr;
+ unsigned blockSize = blkNode->Size();
+ GenTree* size = nullptr;
+ bool evalSizeFirst = true;
+
+ if (blkNode->OperGet() == GT_STORE_DYN_BLK)
+ {
+ evalSizeFirst = blkNode->AsDynBlk()->gtEvalSizeFirst;
+ size = blkNode->AsDynBlk()->gtDynamicSize;
+ }
+
+    // First, consume all the sources in order.
+ if (evalSizeFirst)
+ {
+ genConsumeBlockSize(blkNode, sizeReg);
+ }
+ if (blkNode->IsReverseOp())
+ {
+ src = genConsumeBlockSrc(blkNode);
+ genConsumeBlockDst(blkNode);
+ }
+ else
+ {
+ genConsumeBlockDst(blkNode);
+ src = genConsumeBlockSrc(blkNode);
+ }
+ if (!evalSizeFirst)
+ {
+ genConsumeBlockSize(blkNode, sizeReg);
+ }
+ // Next, perform any necessary moves.
+ if (evalSizeFirst && (size != nullptr) && (size->gtRegNum != sizeReg))
+ {
+ inst_RV_RV(INS_mov, sizeReg, size->gtRegNum, size->TypeGet());
+ }
+ if (blkNode->IsReverseOp())
+ {
+ if ((src != nullptr) && (src->gtRegNum != srcReg))
+ {
+ inst_RV_RV(INS_mov, srcReg, src->gtRegNum, src->TypeGet());
+ }
+ if (dstAddr->gtRegNum != dstReg)
+ {
+ inst_RV_RV(INS_mov, dstReg, dstAddr->gtRegNum, dstAddr->TypeGet());
+ }
+ }
+ else
+ {
+ if (dstAddr->gtRegNum != dstReg)
+ {
+ inst_RV_RV(INS_mov, dstReg, dstAddr->gtRegNum, dstAddr->TypeGet());
+ }
+ if ((src != nullptr) && (src->gtRegNum != srcReg))
+ {
+ inst_RV_RV(INS_mov, srcReg, src->gtRegNum, src->TypeGet());
+ }
+ }
+ if (!evalSizeFirst && size != nullptr && (size->gtRegNum != sizeReg))
+ {
+ inst_RV_RV(INS_mov, sizeReg, size->gtRegNum, size->TypeGet());
+ }
+}
+
+//-------------------------------------------------------------------------
+// genProduceReg: do liveness update for register produced by the current
+// node in codegen.
+//
+// Arguments:
+//    tree - GenTree node
+//
+// Return Value:
+// None.
+void CodeGen::genProduceReg(GenTree* tree)
+{
+ if (tree->gtFlags & GTF_SPILL)
+ {
+ // Code for GT_COPY node gets generated as part of consuming regs by its parent.
+ // A GT_COPY node in turn produces reg result and it should never be marked to
+ // spill.
+ //
+ // Similarly GT_RELOAD node gets generated as part of consuming regs by its
+ // parent and should never be marked for spilling.
+ noway_assert(!tree->IsCopyOrReload());
+
+ if (genIsRegCandidateLocal(tree))
+ {
+ // Store local variable to its home location.
+ tree->gtFlags &= ~GTF_REG_VAL;
+ // Ensure that lclVar stores are typed correctly.
+ unsigned varNum = tree->gtLclVarCommon.gtLclNum;
+ assert(!compiler->lvaTable[varNum].lvNormalizeOnStore() ||
+ (tree->TypeGet() == genActualType(compiler->lvaTable[varNum].TypeGet())));
+ inst_TT_RV(ins_Store(tree->gtType, compiler->isSIMDTypeLocalAligned(varNum)), tree, tree->gtRegNum);
+ }
+ else
+ {
+            // In the case of a multi-reg call node, the spill flag on the call node
+            // indicates that one or more of its allocated regs need to be spilled.
+            // The call node must be further queried to determine which of its result
+            // regs need to be spilled.
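+            //
+            // For example (illustrative only): a multi-reg call returning in two registers with
+            // GTF_SPILL set only on its second return reg will have just that reg spilled by the
+            // loop below; the first return reg is left as-is.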
+ if (tree->IsMultiRegCall())
+ {
+ GenTreeCall* call = tree->AsCall();
+ ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+ unsigned regCount = retTypeDesc->GetReturnRegCount();
+
+ for (unsigned i = 0; i < regCount; ++i)
+ {
+ unsigned flags = call->GetRegSpillFlagByIdx(i);
+ if ((flags & GTF_SPILL) != 0)
+ {
+ regNumber reg = call->GetRegNumByIdx(i);
+ call->SetInReg();
+ regSet.rsSpillTree(reg, call, i);
+ gcInfo.gcMarkRegSetNpt(genRegMask(reg));
+ }
+ }
+ }
+ else
+ {
+ tree->SetInReg();
+ regSet.rsSpillTree(tree->gtRegNum, tree);
+ gcInfo.gcMarkRegSetNpt(genRegMask(tree->gtRegNum));
+ }
+
+ tree->gtFlags |= GTF_SPILLED;
+ tree->gtFlags &= ~GTF_SPILL;
+
+ return;
+ }
+ }
+
+ genUpdateLife(tree);
+
+ // If we've produced a register, mark it as a pointer, as needed.
+ if (tree->gtHasReg())
+ {
+ // We only mark the register in the following cases:
+ // 1. It is not a register candidate local. In this case, we're producing a
+ // register from a local, but the local is not a register candidate. Thus,
+ // we must be loading it as a temp register, and any "last use" flag on
+ // the register wouldn't be relevant.
+        // 2. The register candidate local is going dead. There's no point in marking
+        //    the register as live, with a GC pointer, if the variable is dead.
+ if (!genIsRegCandidateLocal(tree) || ((tree->gtFlags & GTF_VAR_DEATH) == 0))
+ {
+ // Multi-reg call node will produce more than one register result.
+ // Mark all the regs produced by call node.
+ if (tree->IsMultiRegCall())
+ {
+ GenTreeCall* call = tree->AsCall();
+ ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+ unsigned regCount = retTypeDesc->GetReturnRegCount();
+
+ for (unsigned i = 0; i < regCount; ++i)
+ {
+ regNumber reg = call->GetRegNumByIdx(i);
+ var_types type = retTypeDesc->GetReturnRegType(i);
+ gcInfo.gcMarkRegPtrVal(reg, type);
+ }
+ }
+ else if (tree->IsCopyOrReloadOfMultiRegCall())
+ {
+ // we should never see reload of multi-reg call here
+ // because GT_RELOAD gets generated in reg consuming path.
+ noway_assert(tree->OperGet() == GT_COPY);
+
+ // A multi-reg GT_COPY node produces those regs to which
+ // copy has taken place.
+ GenTreeCopyOrReload* copy = tree->AsCopyOrReload();
+ GenTreeCall* call = copy->gtGetOp1()->AsCall();
+ ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+ unsigned regCount = retTypeDesc->GetReturnRegCount();
+
+ for (unsigned i = 0; i < regCount; ++i)
+ {
+ var_types type = retTypeDesc->GetReturnRegType(i);
+ regNumber fromReg = call->GetRegNumByIdx(i);
+ regNumber toReg = copy->GetRegNumByIdx(i);
+
+ if (toReg != REG_NA)
+ {
+ gcInfo.gcMarkRegPtrVal(toReg, type);
+ }
+ }
+ }
+ else
+ {
+ gcInfo.gcMarkRegPtrVal(tree->gtRegNum, tree->TypeGet());
+ }
+ }
+ }
+ tree->SetInReg();
+}
+
+// transfer gc/byref status of src reg to dst reg
+void CodeGen::genTransferRegGCState(regNumber dst, regNumber src)
+{
+ regMaskTP srcMask = genRegMask(src);
+ regMaskTP dstMask = genRegMask(dst);
+
+ if (gcInfo.gcRegGCrefSetCur & srcMask)
+ {
+ gcInfo.gcMarkRegSetGCref(dstMask);
+ }
+ else if (gcInfo.gcRegByrefSetCur & srcMask)
+ {
+ gcInfo.gcMarkRegSetByref(dstMask);
+ }
+ else
+ {
+ gcInfo.gcMarkRegSetNpt(dstMask);
+ }
+}
+
+// generates an ip-relative call or indirect call via reg ('call reg')
+// pass in 'addr' for a relative call or 'base' for an indirect register call
+// methHnd - optional, only used for pretty printing
+// retSize - emitter type of return for GC purposes, should be EA_BYREF, EA_GCREF, or EA_PTRSIZE (not GC)
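+//
+// For example (an illustrative sketch of how this overload is used, not an exhaustive list):
+// a direct call to a known address passes EC_FUNC_TOKEN with 'addr' set and leaves 'base' as REG_NA,
+// while an indirect "call rax" passes EC_INDIR_R with a null 'addr' and 'base' == REG_RAX.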
+void CodeGen::genEmitCall(int callType,
+ CORINFO_METHOD_HANDLE methHnd,
+ INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo) void* addr X86_ARG(ssize_t argSize),
+ emitAttr retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize),
+ IL_OFFSETX ilOffset,
+ regNumber base,
+ bool isJump,
+ bool isNoGC)
+{
+#if !defined(_TARGET_X86_)
+ ssize_t argSize = 0;
+#endif // !defined(_TARGET_X86_)
+ getEmitter()->emitIns_Call(emitter::EmitCallType(callType), methHnd, INDEBUG_LDISASM_COMMA(sigInfo) addr, argSize,
+ retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), gcInfo.gcVarPtrSetCur,
+ gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur, ilOffset, base, REG_NA, 0, 0, isJump,
+ emitter::emitNoGChelper(compiler->eeGetHelperNum(methHnd)));
+}
+
+// generates an indirect call via addressing mode (call []) given an indir node
+// methHnd - optional, only used for pretty printing
+// retSize - emitter type of return for GC purposes, should be EA_BYREF, EA_GCREF, or EA_PTRSIZE (not GC)
+void CodeGen::genEmitCall(int callType,
+ CORINFO_METHOD_HANDLE methHnd,
+ INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo) GenTreeIndir* indir X86_ARG(ssize_t argSize),
+ emitAttr retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize),
+ IL_OFFSETX ilOffset)
+{
+#if !defined(_TARGET_X86_)
+ ssize_t argSize = 0;
+#endif // !defined(_TARGET_X86_)
+ genConsumeAddress(indir->Addr());
+
+ getEmitter()->emitIns_Call(emitter::EmitCallType(callType), methHnd, INDEBUG_LDISASM_COMMA(sigInfo) nullptr,
+ argSize, retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize),
+ gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur, ilOffset,
+ indir->Base() ? indir->Base()->gtRegNum : REG_NA,
+ indir->Index() ? indir->Index()->gtRegNum : REG_NA, indir->Scale(), indir->Offset());
+}
+
+//------------------------------------------------------------------------
+// genStoreInd: Generate code for a GT_STOREIND node.
+//
+// Arguments:
+//    node - The GT_STOREIND node for which to generate code.
+//
+// Return Value:
+// none
+
+void CodeGen::genStoreInd(GenTreePtr node)
+{
+ assert(node->OperGet() == GT_STOREIND);
+
+#ifdef FEATURE_SIMD
+ // Storing Vector3 of size 12 bytes through indirection
+ if (node->TypeGet() == TYP_SIMD12)
+ {
+ genStoreIndTypeSIMD12(node);
+ return;
+ }
+#endif // FEATURE_SIMD
+
+ GenTreeStoreInd* storeInd = node->AsStoreInd();
+ GenTree* data = storeInd->Data();
+ GenTree* addr = storeInd->Addr();
+ var_types targetType = storeInd->TypeGet();
+
+ assert(!varTypeIsFloating(targetType) || (targetType == data->TypeGet()));
+
+ GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(storeInd, data);
+ if (writeBarrierForm != GCInfo::WBF_NoBarrier)
+ {
+ // data and addr must be in registers.
+ // Consume both registers so that any copies of interfering registers are taken care of.
+ genConsumeOperands(storeInd->AsOp());
+
+ if (genEmitOptimizedGCWriteBarrier(writeBarrierForm, addr, data))
+ {
+ return;
+ }
+
+ // At this point, we should not have any interference.
+ // That is, 'data' must not be in REG_ARG_0, as that is where 'addr' must go.
+ noway_assert(data->gtRegNum != REG_ARG_0);
+
+ // addr goes in REG_ARG_0
+ if (addr->gtRegNum != REG_ARG_0)
+ {
+ inst_RV_RV(INS_mov, REG_ARG_0, addr->gtRegNum, addr->TypeGet());
+ }
+
+ // data goes in REG_ARG_1
+ if (data->gtRegNum != REG_ARG_1)
+ {
+ inst_RV_RV(INS_mov, REG_ARG_1, data->gtRegNum, data->TypeGet());
+ }
+
+ genGCWriteBarrier(storeInd, writeBarrierForm);
+ }
+ else
+ {
+ bool reverseOps = ((storeInd->gtFlags & GTF_REVERSE_OPS) != 0);
+ bool dataIsUnary = false;
+ bool isRMWMemoryOp = storeInd->IsRMWMemoryOp();
+ GenTree* rmwSrc = nullptr;
+
+ // We must consume the operands in the proper execution order, so that liveness is
+ // updated appropriately.
+ if (!reverseOps)
+ {
+ genConsumeAddress(addr);
+ }
+
+        // If storeInd represents a RMW memory op, then its data is a non-leaf node marked as contained,
+        // and the non-indir operand of data is the source of the RMW memory op.
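+        //
+        // For example (a sketch, not actual JIT dump output): for "*p = *p + 1" the data node is a
+        // contained GT_ADD whose GT_IND operand matches this GT_STOREIND, and the code emitted below
+        // is a single "add [addr], 1" style read-modify-write instruction via emitInsRMW.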
+ if (isRMWMemoryOp)
+ {
+ assert(data->isContained() && !data->OperIsLeaf());
+
+ GenTreePtr rmwDst = nullptr;
+
+ dataIsUnary = (GenTree::OperIsUnary(data->OperGet()) != 0);
+ if (!dataIsUnary)
+ {
+ if (storeInd->IsRMWDstOp1())
+ {
+ rmwDst = data->gtGetOp1();
+ rmwSrc = data->gtGetOp2();
+ }
+ else
+ {
+ assert(storeInd->IsRMWDstOp2());
+ rmwDst = data->gtGetOp2();
+ rmwSrc = data->gtGetOp1();
+ }
+
+ genConsumeRegs(rmwSrc);
+ }
+ else
+ {
+                // *(p) = oper *(p): Here addr = p, rmwSrc = rmwDst = *(p) i.e. GT_IND(p)
+                // For unary RMW ops, the src and dst of the RMW memory op are the same. Lower
+                // clears the operand counts on rmwSrc, so we don't need to perform a
+ // genConsumeReg() on it.
+ assert(storeInd->IsRMWDstOp1());
+ rmwSrc = data->gtGetOp1();
+ rmwDst = data->gtGetOp1();
+ assert(rmwSrc->isContained());
+ }
+
+ assert(rmwSrc != nullptr);
+ assert(rmwDst != nullptr);
+ assert(Lowering::IndirsAreEquivalent(rmwDst, storeInd));
+ }
+ else
+ {
+ genConsumeRegs(data);
+ }
+
+ if (reverseOps)
+ {
+ genConsumeAddress(addr);
+ }
+
+ if (isRMWMemoryOp)
+ {
+ if (dataIsUnary)
+ {
+ // generate code for unary RMW memory ops like neg/not
+ getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(storeInd),
+ storeInd);
+ }
+ else
+ {
+ if (data->OperIsShiftOrRotate())
+ {
+ // Generate code for shift RMW memory ops.
+ // The data address needs to be op1 (it must be [addr] = [addr] <shift> <amount>, not [addr] =
+ // <amount> <shift> [addr]).
+ assert(storeInd->IsRMWDstOp1());
+ assert(rmwSrc == data->gtGetOp2());
+ genCodeForShiftRMW(storeInd);
+ }
+ else
+ {
+ // generate code for remaining binary RMW memory ops like add/sub/and/or/xor
+ getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(storeInd),
+ storeInd, rmwSrc);
+ }
+ }
+ }
+ else
+ {
+ getEmitter()->emitInsMov(ins_Store(data->TypeGet()), emitTypeSize(storeInd), storeInd);
+ }
+ }
+}
+
+//------------------------------------------------------------------------
+// genEmitOptimizedGCWriteBarrier: Generate write barrier store using the optimized
+// helper functions.
+//
+// Arguments:
+// writeBarrierForm - the write barrier form to use
+// addr - the address at which to do the store
+// data - the data to store
+//
+// Return Value:
+// true if an optimized write barrier form was used, false if not. If this
+// function returns false, the caller must emit a "standard" write barrier.
+
+bool CodeGen::genEmitOptimizedGCWriteBarrier(GCInfo::WriteBarrierForm writeBarrierForm, GenTree* addr, GenTree* data)
+{
+ assert(writeBarrierForm != GCInfo::WBF_NoBarrier);
+
+#if defined(_TARGET_X86_) && NOGC_WRITE_BARRIERS
+ bool useOptimizedWriteBarriers = true;
+
+#ifdef DEBUG
+ useOptimizedWriteBarriers =
+ (writeBarrierForm != GCInfo::WBF_NoBarrier_CheckNotHeapInDebug); // This one is always a call to a C++ method.
+#endif
+
+ if (!useOptimizedWriteBarriers)
+ {
+ return false;
+ }
+
+ const static int regToHelper[2][8] = {
+ // If the target is known to be in managed memory
+ {
+ CORINFO_HELP_ASSIGN_REF_EAX, CORINFO_HELP_ASSIGN_REF_ECX, -1, CORINFO_HELP_ASSIGN_REF_EBX, -1,
+ CORINFO_HELP_ASSIGN_REF_EBP, CORINFO_HELP_ASSIGN_REF_ESI, CORINFO_HELP_ASSIGN_REF_EDI,
+ },
+
+ // Don't know if the target is in managed memory
+ {
+ CORINFO_HELP_CHECKED_ASSIGN_REF_EAX, CORINFO_HELP_CHECKED_ASSIGN_REF_ECX, -1,
+ CORINFO_HELP_CHECKED_ASSIGN_REF_EBX, -1, CORINFO_HELP_CHECKED_ASSIGN_REF_EBP,
+ CORINFO_HELP_CHECKED_ASSIGN_REF_ESI, CORINFO_HELP_CHECKED_ASSIGN_REF_EDI,
+ },
+ };
+
+ noway_assert(regToHelper[0][REG_EAX] == CORINFO_HELP_ASSIGN_REF_EAX);
+ noway_assert(regToHelper[0][REG_ECX] == CORINFO_HELP_ASSIGN_REF_ECX);
+ noway_assert(regToHelper[0][REG_EBX] == CORINFO_HELP_ASSIGN_REF_EBX);
+ noway_assert(regToHelper[0][REG_ESP] == -1);
+ noway_assert(regToHelper[0][REG_EBP] == CORINFO_HELP_ASSIGN_REF_EBP);
+ noway_assert(regToHelper[0][REG_ESI] == CORINFO_HELP_ASSIGN_REF_ESI);
+ noway_assert(regToHelper[0][REG_EDI] == CORINFO_HELP_ASSIGN_REF_EDI);
+
+ noway_assert(regToHelper[1][REG_EAX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EAX);
+ noway_assert(regToHelper[1][REG_ECX] == CORINFO_HELP_CHECKED_ASSIGN_REF_ECX);
+ noway_assert(regToHelper[1][REG_EBX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBX);
+ noway_assert(regToHelper[1][REG_ESP] == -1);
+ noway_assert(regToHelper[1][REG_EBP] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBP);
+ noway_assert(regToHelper[1][REG_ESI] == CORINFO_HELP_CHECKED_ASSIGN_REF_ESI);
+ noway_assert(regToHelper[1][REG_EDI] == CORINFO_HELP_CHECKED_ASSIGN_REF_EDI);
+
+ regNumber reg = data->gtRegNum;
+ noway_assert((reg != REG_ESP) && (reg != REG_WRITE_BARRIER));
+
+ // Generate the following code:
+ // lea edx, addr
+ // call write_barrier_helper_reg
+
+    // The address goes in REG_WRITE_BARRIER (EDX on x86).
+    if (addr->gtRegNum != REG_WRITE_BARRIER) // REVIEW: can it ever not already be in this register?
+ {
+ inst_RV_RV(INS_mov, REG_WRITE_BARRIER, addr->gtRegNum, addr->TypeGet());
+ }
+
+ unsigned tgtAnywhere = 0;
+ if (writeBarrierForm != GCInfo::WBF_BarrierUnchecked)
+ {
+ tgtAnywhere = 1;
+ }
+
+ // We might want to call a modified version of genGCWriteBarrier() to get the benefit of
+ // the FEATURE_COUNT_GC_WRITE_BARRIERS code there, but that code doesn't look like it works
+ // with rationalized RyuJIT IR. So, for now, just emit the helper call directly here.
+
+ genEmitHelperCall(regToHelper[tgtAnywhere][reg],
+ 0, // argSize
+ EA_PTRSIZE); // retSize
+
+ return true;
+#else // !defined(_TARGET_X86_) || !NOGC_WRITE_BARRIERS
+ return false;
+#endif // !defined(_TARGET_X86_) || !NOGC_WRITE_BARRIERS
+}
+
+// Produce code for a GT_CALL node
+void CodeGen::genCallInstruction(GenTreePtr node)
+{
+ GenTreeCall* call = node->AsCall();
+ assert(call->gtOper == GT_CALL);
+
+ gtCallTypes callType = (gtCallTypes)call->gtCallType;
+
+ IL_OFFSETX ilOffset = BAD_IL_OFFSET;
+
+ // all virtuals should have been expanded into a control expression
+ assert(!call->IsVirtual() || call->gtControlExpr || call->gtCallAddr);
+
+ // Consume all the arg regs
+ for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
+ {
+ assert(list->IsList());
+
+ GenTreePtr argNode = list->Current();
+
+ fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode->gtSkipReloadOrCopy());
+ assert(curArgTabEntry);
+
+ if (curArgTabEntry->regNum == REG_STK)
+ {
+ continue;
+ }
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // Deal with multi register passed struct args.
+ if (argNode->OperGet() == GT_LIST)
+ {
+ GenTreeArgList* argListPtr = argNode->AsArgList();
+ unsigned iterationNum = 0;
+ for (; argListPtr != nullptr; argListPtr = argListPtr->Rest(), iterationNum++)
+ {
+ GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1;
+ assert(putArgRegNode->gtOper == GT_PUTARG_REG);
+ regNumber argReg = REG_NA;
+
+ if (iterationNum == 0)
+ {
+ argReg = curArgTabEntry->regNum;
+ }
+ else
+ {
+ assert(iterationNum == 1);
+ argReg = curArgTabEntry->otherRegNum;
+ }
+
+ genConsumeReg(putArgRegNode);
+
+ // Validate the putArgRegNode has the right type.
+ assert(putArgRegNode->TypeGet() ==
+ compiler->GetTypeFromClassificationAndSizes(curArgTabEntry->structDesc
+ .eightByteClassifications[iterationNum],
+ curArgTabEntry->structDesc
+ .eightByteSizes[iterationNum]));
+ if (putArgRegNode->gtRegNum != argReg)
+ {
+ inst_RV_RV(ins_Move_Extend(putArgRegNode->TypeGet(), putArgRegNode->InReg()), argReg,
+ putArgRegNode->gtRegNum);
+ }
+ }
+ }
+ else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+ {
+ regNumber argReg = curArgTabEntry->regNum;
+ genConsumeReg(argNode);
+ if (argNode->gtRegNum != argReg)
+ {
+ inst_RV_RV(ins_Move_Extend(argNode->TypeGet(), argNode->InReg()), argReg, argNode->gtRegNum);
+ }
+ }
+
+#if FEATURE_VARARG
+ // In the case of a varargs call,
+ // the ABI dictates that if we have floating point args,
+ // we must pass the enregistered arguments in both the
+        // integer and floating point registers, so let's do that.
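+        //
+        // For example (illustrative, assuming the AMD64 varargs convention): a TYP_DOUBLE argument
+        // allocated to xmm1 is also copied into rdx below via ins_CopyFloatToInt, so the callee can
+        // read it from either register.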
+ if (call->IsVarargs() && varTypeIsFloating(argNode))
+ {
+ regNumber targetReg = compiler->getCallArgIntRegister(argNode->gtRegNum);
+ instruction ins = ins_CopyFloatToInt(argNode->TypeGet(), TYP_LONG);
+ inst_RV_RV(ins, argNode->gtRegNum, targetReg);
+ }
+#endif // FEATURE_VARARG
+ }
+
+#if defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ // The call will pop its arguments.
+ // for each putarg_stk:
+ ssize_t stackArgBytes = 0;
+ GenTreePtr args = call->gtCallArgs;
+ while (args)
+ {
+ GenTreePtr arg = args->gtOp.gtOp1;
+ if (arg->OperGet() != GT_ARGPLACE && !(arg->gtFlags & GTF_LATE_ARG))
+ {
+#if defined(_TARGET_X86_)
+ assert((arg->OperGet() == GT_PUTARG_STK) || (arg->OperGet() == GT_LONG));
+ if (arg->OperGet() == GT_LONG)
+ {
+ assert((arg->gtGetOp1()->OperGet() == GT_PUTARG_STK) && (arg->gtGetOp2()->OperGet() == GT_PUTARG_STK));
+ }
+#endif // defined(_TARGET_X86_)
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ if (genActualType(arg->TypeGet()) == TYP_STRUCT)
+ {
+ assert(arg->OperGet() == GT_PUTARG_STK);
+
+ GenTreeObj* obj = arg->gtGetOp1()->AsObj();
+ stackArgBytes = compiler->info.compCompHnd->getClassSize(obj->gtClass);
+ }
+ else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ stackArgBytes += genTypeSize(genActualType(arg->TypeGet()));
+ }
+ args = args->gtOp.gtOp2;
+ }
+#endif // defined(_TARGET_X86_) || defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+ // Insert a null check on "this" pointer if asked.
+ if (call->NeedsNullCheck())
+ {
+ const regNumber regThis = genGetThisArgReg(call);
+ getEmitter()->emitIns_AR_R(INS_cmp, EA_4BYTE, regThis, regThis, 0);
+ }
+
+ // Either gtControlExpr != null or gtCallAddr != null or it is a direct non-virtual call to a user or helper method.
+ CORINFO_METHOD_HANDLE methHnd;
+ GenTree* target = call->gtControlExpr;
+ if (callType == CT_INDIRECT)
+ {
+ assert(target == nullptr);
+ target = call->gtCall.gtCallAddr;
+ methHnd = nullptr;
+ }
+ else
+ {
+ methHnd = call->gtCallMethHnd;
+ }
+
+ CORINFO_SIG_INFO* sigInfo = nullptr;
+#ifdef DEBUG
+ // Pass the call signature information down into the emitter so the emitter can associate
+ // native call sites with the signatures they were generated from.
+ if (callType != CT_HELPER)
+ {
+ sigInfo = call->callSig;
+ }
+#endif // DEBUG
+
+    // If fast tail call, then we are done. In this case we set up the args (both reg args
+    // and stack args in the incoming arg area) and the call target in rax. The epilog sequence will
+    // generate "jmp rax".
+ if (call->IsFastTailCall())
+ {
+ // Don't support fast tail calling JIT helpers
+ assert(callType != CT_HELPER);
+
+ // Fast tail calls materialize call target either in gtControlExpr or in gtCallAddr.
+ assert(target != nullptr);
+
+ genConsumeReg(target);
+ if (target->gtRegNum != REG_RAX)
+ {
+ inst_RV_RV(INS_mov, REG_RAX, target->gtRegNum);
+ }
+ return;
+ }
+
+    // For a pinvoke to unmanaged code we emit a label to clear
+ // the GC pointer state before the callsite.
+ // We can't utilize the typical lazy killing of GC pointers
+ // at (or inside) the callsite.
+ if (call->IsUnmanaged())
+ {
+ genDefineTempLabel(genCreateTempLabel());
+ }
+
+ // Determine return value size(s).
+ ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+ emitAttr retSize = EA_PTRSIZE;
+ emitAttr secondRetSize = EA_UNKNOWN;
+
+ if (call->HasMultiRegRetVal())
+ {
+ retSize = emitTypeSize(retTypeDesc->GetReturnRegType(0));
+ secondRetSize = emitTypeSize(retTypeDesc->GetReturnRegType(1));
+ }
+ else
+ {
+ assert(!varTypeIsStruct(call));
+
+ if (call->gtType == TYP_REF || call->gtType == TYP_ARRAY)
+ {
+ retSize = EA_GCREF;
+ }
+ else if (call->gtType == TYP_BYREF)
+ {
+ retSize = EA_BYREF;
+ }
+ }
+
+ bool fPossibleSyncHelperCall = false;
+ CorInfoHelpFunc helperNum = CORINFO_HELP_UNDEF;
+
+#ifdef DEBUGGING_SUPPORT
+ // We need to propagate the IL offset information to the call instruction, so we can emit
+ // an IL to native mapping record for the call, to support managed return value debugging.
+ // We don't want tail call helper calls that were converted from normal calls to get a record,
+ // so we skip this hash table lookup logic in that case.
+ if (compiler->opts.compDbgInfo && compiler->genCallSite2ILOffsetMap != nullptr && !call->IsTailCall())
+ {
+ (void)compiler->genCallSite2ILOffsetMap->Lookup(call, &ilOffset);
+ }
+#endif // DEBUGGING_SUPPORT
+
+#if defined(_TARGET_X86_)
+ // If the callee pops the arguments, we pass a positive value as the argSize, and the emitter will
+ // adjust its stack level accordingly.
+ // If the caller needs to explicitly pop its arguments, we must pass a negative value, and then do the
+ // pop when we're done.
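+    //
+    // For example (a hypothetical case): a caller-pop call that pushed 8 bytes of arguments passes
+    // -8 to the emitter here, and the 8 bytes are popped explicitly via genAdjustSP after the call
+    // (see the end of this function).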
+ ssize_t argSizeForEmitter = stackArgBytes;
+ if ((call->gtFlags & GTF_CALL_POP_ARGS) != 0)
+ {
+ argSizeForEmitter = -stackArgBytes;
+ }
+
+#endif // defined(_TARGET_X86_)
+
+ if (target != nullptr)
+ {
+ if (target->isContainedIndir())
+ {
+ if (target->AsIndir()->HasBase() && target->AsIndir()->Base()->isContainedIntOrIImmed())
+ {
+ // Note that if gtControlExpr is an indir of an absolute address, we mark it as
+ // contained only if it can be encoded as PC-relative offset.
+ assert(target->AsIndir()->Base()->AsIntConCommon()->FitsInAddrBase(compiler));
+
+ genEmitCall(emitter::EC_FUNC_TOKEN_INDIR, methHnd,
+ INDEBUG_LDISASM_COMMA(sigInfo)(void*) target->AsIndir()
+ ->Base()
+ ->AsIntConCommon()
+ ->IconValue() X86_ARG(argSizeForEmitter),
+ retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), ilOffset);
+ }
+ else
+ {
+ genEmitCall(emitter::EC_INDIR_ARD, methHnd,
+ INDEBUG_LDISASM_COMMA(sigInfo) target->AsIndir() X86_ARG(argSizeForEmitter),
+ retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), ilOffset);
+ }
+ }
+ else
+ {
+ // We have already generated code for gtControlExpr evaluating it into a register.
+ // We just need to emit "call reg" in this case.
+ assert(genIsValidIntReg(target->gtRegNum));
+ genEmitCall(emitter::EC_INDIR_R, methHnd,
+ INDEBUG_LDISASM_COMMA(sigInfo) nullptr // addr
+ X86_ARG(argSizeForEmitter),
+ retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), ilOffset, genConsumeReg(target));
+ }
+ }
+#ifdef FEATURE_READYTORUN_COMPILER
+ else if (call->gtEntryPoint.addr != nullptr)
+ {
+ genEmitCall((call->gtEntryPoint.accessType == IAT_VALUE) ? emitter::EC_FUNC_TOKEN
+ : emitter::EC_FUNC_TOKEN_INDIR,
+ methHnd, INDEBUG_LDISASM_COMMA(sigInfo)(void*) call->gtEntryPoint.addr X86_ARG(argSizeForEmitter),
+ retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), ilOffset);
+ }
+#endif
+ else
+ {
+ // Generate a direct call to a non-virtual user defined or helper method
+ assert(callType == CT_HELPER || callType == CT_USER_FUNC);
+
+ void* addr = nullptr;
+ if (callType == CT_HELPER)
+ {
+ // Direct call to a helper method.
+ helperNum = compiler->eeGetHelperNum(methHnd);
+ noway_assert(helperNum != CORINFO_HELP_UNDEF);
+
+ void* pAddr = nullptr;
+ addr = compiler->compGetHelperFtn(helperNum, (void**)&pAddr);
+
+ if (addr == nullptr)
+ {
+ addr = pAddr;
+ }
+
+ // tracking of region protected by the monitor in synchronized methods
+ if (compiler->info.compFlags & CORINFO_FLG_SYNCH)
+ {
+ fPossibleSyncHelperCall = true;
+ }
+ }
+ else
+ {
+ // Direct call to a non-virtual user function.
+ addr = call->gtDirectCallAddress;
+ }
+
+ // Non-virtual direct calls to known addresses
+ genEmitCall(emitter::EC_FUNC_TOKEN, methHnd, INDEBUG_LDISASM_COMMA(sigInfo) addr X86_ARG(argSizeForEmitter),
+ retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize), ilOffset);
+ }
+
+ // if it was a pinvoke we may have needed to get the address of a label
+ if (genPendingCallLabel)
+ {
+ assert(call->IsUnmanaged());
+ genDefineTempLabel(genPendingCallLabel);
+ genPendingCallLabel = nullptr;
+ }
+
+#if defined(_TARGET_X86_)
+ // The call will pop its arguments.
+ genStackLevel -= stackArgBytes;
+#endif // defined(_TARGET_X86_)
+
+ // Update GC info:
+ // All Callee arg registers are trashed and no longer contain any GC pointers.
+ // TODO-XArch-Bug?: As a matter of fact shouldn't we be killing all of callee trashed regs here?
+ // For now we will assert that other than arg regs gc ref/byref set doesn't contain any other
+ // registers from RBM_CALLEE_TRASH.
+ assert((gcInfo.gcRegGCrefSetCur & (RBM_CALLEE_TRASH & ~RBM_ARG_REGS)) == 0);
+ assert((gcInfo.gcRegByrefSetCur & (RBM_CALLEE_TRASH & ~RBM_ARG_REGS)) == 0);
+ gcInfo.gcRegGCrefSetCur &= ~RBM_ARG_REGS;
+ gcInfo.gcRegByrefSetCur &= ~RBM_ARG_REGS;
+
+ var_types returnType = call->TypeGet();
+ if (returnType != TYP_VOID)
+ {
+#ifdef _TARGET_X86_
+ if (varTypeIsFloating(returnType))
+ {
+ // Spill the value from the fp stack.
+ // Then, load it into the target register.
+ call->gtFlags |= GTF_SPILL;
+ regSet.rsSpillFPStack(call);
+ call->gtFlags |= GTF_SPILLED;
+ call->gtFlags &= ~GTF_SPILL;
+ }
+ else
+#endif // _TARGET_X86_
+ {
+ regNumber returnReg;
+
+ if (call->HasMultiRegRetVal())
+ {
+ assert(retTypeDesc != nullptr);
+ unsigned regCount = retTypeDesc->GetReturnRegCount();
+
+ // If regs allocated to call node are different from ABI return
+ // regs in which the call has returned its result, move the result
+ // to regs allocated to call node.
+ for (unsigned i = 0; i < regCount; ++i)
+ {
+ var_types regType = retTypeDesc->GetReturnRegType(i);
+ returnReg = retTypeDesc->GetABIReturnReg(i);
+ regNumber allocatedReg = call->GetRegNumByIdx(i);
+ if (returnReg != allocatedReg)
+ {
+ inst_RV_RV(ins_Copy(regType), allocatedReg, returnReg, regType);
+ }
+ }
+
+#ifdef FEATURE_SIMD
+ // A Vector3 return value is stored in xmm0 and xmm1.
+ // RyuJIT assumes that the upper unused bits of xmm1 are cleared but
+ // the native compiler doesn't guarantee it.
+ if (returnType == TYP_SIMD12)
+ {
+ returnReg = retTypeDesc->GetABIReturnReg(1);
+ // Clear the upper 32 bits by two shift instructions.
+ // retReg = retReg << 96
+ // retReg = retReg >> 96
+ getEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12);
+ getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), returnReg, 12);
+ }
+#endif // FEATURE_SIMD
+ }
+ else
+ {
+#ifdef _TARGET_X86_
+ if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
+ {
+ // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
+ // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
+ // correct argument registers.
+ returnReg = REG_PINVOKE_TCB;
+ }
+ else
+#endif // _TARGET_X86_
+ if (varTypeIsFloating(returnType))
+ {
+ returnReg = REG_FLOATRET;
+ }
+ else
+ {
+ returnReg = REG_INTRET;
+ }
+
+ if (call->gtRegNum != returnReg)
+ {
+ inst_RV_RV(ins_Copy(returnType), call->gtRegNum, returnReg, returnType);
+ }
+ }
+
+ genProduceReg(call);
+ }
+ }
+
+ // If there is nothing next, that means the result is thrown away, so this value is not live.
+ // However, for minopts or debuggable code, we keep it live to support managed return value debugging.
+ if ((call->gtNext == nullptr) && !compiler->opts.MinOpts() && !compiler->opts.compDbgCode)
+ {
+ gcInfo.gcMarkRegSetNpt(RBM_INTRET);
+ }
+
+#if defined(_TARGET_X86_)
+ //-------------------------------------------------------------------------
+ // Create a label for tracking of region protected by the monitor in synchronized methods.
+ // This needs to be here, rather than above where fPossibleSyncHelperCall is set,
+ // so the GC state vars have been updated before creating the label.
+
+ if (fPossibleSyncHelperCall)
+ {
+ switch (helperNum)
+ {
+ case CORINFO_HELP_MON_ENTER:
+ case CORINFO_HELP_MON_ENTER_STATIC:
+ noway_assert(compiler->syncStartEmitCookie == NULL);
+ compiler->syncStartEmitCookie =
+ getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur);
+ noway_assert(compiler->syncStartEmitCookie != NULL);
+ break;
+ case CORINFO_HELP_MON_EXIT:
+ case CORINFO_HELP_MON_EXIT_STATIC:
+ noway_assert(compiler->syncEndEmitCookie == NULL);
+ compiler->syncEndEmitCookie =
+ getEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur);
+ noway_assert(compiler->syncEndEmitCookie != NULL);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Is the caller supposed to pop the arguments?
+ if (((call->gtFlags & GTF_CALL_POP_ARGS) != 0) && (stackArgBytes != 0))
+ {
+ genAdjustSP(stackArgBytes);
+ }
+#endif // _TARGET_X86_
+}
+
+// Produce code for a GT_JMP node.
+// The arguments of the caller need to be transferred to the callee before exiting the caller.
+// The actual jump to the callee is generated as part of the caller's epilog sequence.
+// Therefore the codegen of GT_JMP is to ensure that the callee arguments are correctly set up.
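+//
+// A rough sketch of the sequence below (just a summary of this function, not a new algorithm):
+// first, any argument that is not already where the callee expects it is spilled to its stack home;
+// then each register argument is reloaded from its stack home into its incoming argument register.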
+void CodeGen::genJmpMethod(GenTreePtr jmp)
+{
+ assert(jmp->OperGet() == GT_JMP);
+ assert(compiler->compJmpOpUsed);
+
+ // If no arguments, nothing to do
+ if (compiler->info.compArgsCount == 0)
+ {
+ return;
+ }
+
+ // Make sure register arguments are in their initial registers
+ // and stack arguments are put back as well.
+ unsigned varNum;
+ LclVarDsc* varDsc;
+
+ // First move any en-registered stack arguments back to the stack.
+ // At the same time any reg arg not in correct reg is moved back to its stack location.
+ //
+    // We are not strictly required to spill reg args that are not in the desired reg for a jmp call,
+    // but that would require us to deal with circularity while moving values around. Spilling
+    // to the stack keeps the implementation simple, which is not a bad trade-off given that jmp calls
+    // are not frequent.
+ for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++)
+ {
+ varDsc = compiler->lvaTable + varNum;
+
+ if (varDsc->lvPromoted)
+ {
+ noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
+
+ unsigned fieldVarNum = varDsc->lvFieldLclStart;
+ varDsc = compiler->lvaTable + fieldVarNum;
+ }
+ noway_assert(varDsc->lvIsParam);
+
+ if (varDsc->lvIsRegArg && (varDsc->lvRegNum != REG_STK))
+ {
+            // Skip reg args that are already in the right register for the jmp call.
+            // If not, we will spill such args to their stack locations.
+ //
+ // If we need to generate a tail call profiler hook, then spill all
+ // arg regs to free them up for the callback.
+ if (!compiler->compIsProfilerHookNeeded() && (varDsc->lvRegNum == varDsc->lvArgReg))
+ {
+ continue;
+ }
+ }
+ else if (varDsc->lvRegNum == REG_STK)
+ {
+            // Skip args that are currently living on the stack.
+ continue;
+ }
+
+ // If we came here it means either a reg argument not in the right register or
+ // a stack argument currently living in a register. In either case the following
+ // assert should hold.
+ assert(varDsc->lvRegNum != REG_STK);
+
+ var_types loadType = varDsc->lvaArgType();
+ getEmitter()->emitIns_S_R(ins_Store(loadType), emitTypeSize(loadType), varDsc->lvRegNum, varNum, 0);
+
+ // Update lvRegNum life and GC info to indicate lvRegNum is dead and varDsc stack slot is going live.
+ // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it.
+ // Therefore manually update life of varDsc->lvRegNum.
+ regMaskTP tempMask = varDsc->lvRegMask();
+ regSet.RemoveMaskVars(tempMask);
+ gcInfo.gcMarkRegSetNpt(tempMask);
+ if (compiler->lvaIsGCTracked(varDsc))
+ {
+#ifdef DEBUG
+ if (!VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex))
+ {
+ JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming live\n", varNum);
+ }
+ else
+ {
+ JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing live\n", varNum);
+ }
+#endif // DEBUG
+
+ VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
+ }
+ }
+
+#ifdef PROFILING_SUPPORTED
+ // At this point all arg regs are free.
+ // Emit tail call profiler callback.
+ genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL);
+#endif
+
+    // Next, move any register arguments that are not currently in their incoming registers back into those registers.
+ regMaskTP fixedIntArgMask = RBM_NONE; // tracks the int arg regs occupying fixed args in case of a vararg method.
+ unsigned firstArgVarNum = BAD_VAR_NUM; // varNum of the first argument in case of a vararg method.
+ for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++)
+ {
+ varDsc = compiler->lvaTable + varNum;
+ if (varDsc->lvPromoted)
+ {
+ noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here
+
+ unsigned fieldVarNum = varDsc->lvFieldLclStart;
+ varDsc = compiler->lvaTable + fieldVarNum;
+ }
+ noway_assert(varDsc->lvIsParam);
+
+ // Skip if arg not passed in a register.
+ if (!varDsc->lvIsRegArg)
+ {
+ continue;
+ }
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ if (varTypeIsStruct(varDsc))
+ {
+ CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle();
+ assert(typeHnd != nullptr);
+
+ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+ compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc);
+ assert(structDesc.passedInRegisters);
+
+ unsigned __int8 offset0 = 0;
+ unsigned __int8 offset1 = 0;
+ var_types type0 = TYP_UNKNOWN;
+ var_types type1 = TYP_UNKNOWN;
+
+ // Get the eightbyte data
+ compiler->GetStructTypeOffset(structDesc, &type0, &type1, &offset0, &offset1);
+
+ // Move the values into the right registers.
+ //
+
+ // Update varDsc->lvArgReg and lvOtherArgReg life and GC Info to indicate varDsc stack slot is dead and
+ // argReg is going live. Note that we cannot modify varDsc->lvRegNum and lvOtherArgReg here because another
+ // basic block may not be expecting it. Therefore manually update life of argReg. Note that GT_JMP marks
+            // the end of the basic block, after which reg life and gc info will be recomputed for the new block in
+ // genCodeForBBList().
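+            //
+            // For example (illustrative, under the SysV classification): a 16-byte struct classified
+            // as { SSE, INTEGER } yields a floating type0 loaded from offset 0 into lvArgReg and an
+            // integer type1 loaded from offset 8 into lvOtherArgReg.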
+ if (type0 != TYP_UNKNOWN)
+ {
+ getEmitter()->emitIns_R_S(ins_Load(type0), emitTypeSize(type0), varDsc->lvArgReg, varNum, offset0);
+ regSet.rsMaskVars |= genRegMask(varDsc->lvArgReg);
+ gcInfo.gcMarkRegPtrVal(varDsc->lvArgReg, type0);
+ }
+
+ if (type1 != TYP_UNKNOWN)
+ {
+ getEmitter()->emitIns_R_S(ins_Load(type1), emitTypeSize(type1), varDsc->lvOtherArgReg, varNum, offset1);
+ regSet.rsMaskVars |= genRegMask(varDsc->lvOtherArgReg);
+ gcInfo.gcMarkRegPtrVal(varDsc->lvOtherArgReg, type1);
+ }
+
+ if (varDsc->lvTracked)
+ {
+ VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
+ }
+ }
+ else
+#endif // !defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+ {
+ // Register argument
+ noway_assert(isRegParamType(genActualType(varDsc->TypeGet())));
+
+ // Is register argument already in the right register?
+ // If not load it from its stack location.
+ var_types loadType = varDsc->lvaArgType();
+ regNumber argReg = varDsc->lvArgReg; // incoming arg register
+
+ if (varDsc->lvRegNum != argReg)
+ {
+ assert(genIsValidReg(argReg));
+ getEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0);
+
+ // Update argReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live.
+ // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it.
+ // Therefore manually update life of argReg. Note that GT_JMP marks the end of the basic block
+                // after which reg life and gc info will be recomputed for the new block in genCodeForBBList().
+ regSet.AddMaskVars(genRegMask(argReg));
+ gcInfo.gcMarkRegPtrVal(argReg, loadType);
+ if (compiler->lvaIsGCTracked(varDsc))
+ {
+#ifdef DEBUG
+ if (VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex))
+ {
+ JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming dead\n", varNum);
+ }
+ else
+ {
+ JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing dead\n", varNum);
+ }
+#endif // DEBUG
+
+ VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex);
+ }
+ }
+ }
+
+#if FEATURE_VARARG && defined(_TARGET_AMD64_)
+ // In case of a jmp call to a vararg method also pass the float/double arg in the corresponding int arg
+ // register. This is due to the AMD64 ABI which requires floating point values passed to varargs functions to
+ // be passed in both integer and floating point registers. It doesn't apply to x86, which passes floating point
+ // values on the stack.
+ if (compiler->info.compIsVarArgs)
+ {
+ regNumber intArgReg;
+ var_types loadType = varDsc->lvaArgType();
+ regNumber argReg = varDsc->lvArgReg; // incoming arg register
+
+ if (varTypeIsFloating(loadType))
+ {
+ intArgReg = compiler->getCallArgIntRegister(argReg);
+ instruction ins = ins_CopyFloatToInt(loadType, TYP_LONG);
+ inst_RV_RV(ins, argReg, intArgReg, loadType);
+ }
+ else
+ {
+ intArgReg = argReg;
+ }
+
+ fixedIntArgMask |= genRegMask(intArgReg);
+
+ if (intArgReg == REG_ARG_0)
+ {
+ assert(firstArgVarNum == BAD_VAR_NUM);
+ firstArgVarNum = varNum;
+ }
+ }
+#endif // FEATURE_VARARG
+ }
+
+#if FEATURE_VARARG && defined(_TARGET_AMD64_)
+ // Jmp call to a vararg method - if the method has fewer than 4 fixed arguments,
+ // load the remaining arg registers (both int and float) from the corresponding
+ // shadow stack slots. This is for the reason that we don't know the number and type
+ // of non-fixed params passed by the caller, therefore we have to assume the worst case
+ // of caller passing float/double args both in int and float arg regs.
+ //
+ // This doesn't apply to x86, which doesn't pass floating point values in floating
+ // point registers.
+ //
+    // The caller could have passed gc-ref/byref type var args. Since these are var args
+    // the callee has no way of knowing their gc-ness. Therefore, mark the region that loads
+ // remaining arg registers from shadow stack slots as non-gc interruptible.
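+    //
+    // The emitted pattern is roughly (a sketch only; the registers are just an example):
+    //     mov  rdx, qword ptr [shadow slot for rdx]
+    //     movd xmm1, rdx
+    // repeated for each remaining fixed int arg register, inside the emitDisableGC/emitEnableGC region.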
+ if (fixedIntArgMask != RBM_NONE)
+ {
+ assert(compiler->info.compIsVarArgs);
+ assert(firstArgVarNum != BAD_VAR_NUM);
+
+ regMaskTP remainingIntArgMask = RBM_ARG_REGS & ~fixedIntArgMask;
+ if (remainingIntArgMask != RBM_NONE)
+ {
+ instruction insCopyIntToFloat = ins_CopyIntToFloat(TYP_LONG, TYP_DOUBLE);
+ getEmitter()->emitDisableGC();
+ for (int argNum = 0, argOffset = 0; argNum < MAX_REG_ARG; ++argNum)
+ {
+ regNumber argReg = intArgRegs[argNum];
+ regMaskTP argRegMask = genRegMask(argReg);
+
+ if ((remainingIntArgMask & argRegMask) != 0)
+ {
+ remainingIntArgMask &= ~argRegMask;
+ getEmitter()->emitIns_R_S(INS_mov, EA_8BYTE, argReg, firstArgVarNum, argOffset);
+
+ // also load it in corresponding float arg reg
+ regNumber floatReg = compiler->getCallArgFloatRegister(argReg);
+ inst_RV_RV(insCopyIntToFloat, floatReg, argReg);
+ }
+
+ argOffset += REGSIZE_BYTES;
+ }
+ getEmitter()->emitEnableGC();
+ }
+ }
+#endif // FEATURE_VARARG
+}
+
+// produce code for a GT_LEA subnode
+void CodeGen::genLeaInstruction(GenTreeAddrMode* lea)
+{
+ emitAttr size = emitTypeSize(lea);
+ genConsumeOperands(lea);
+
+ if (lea->Base() && lea->Index())
+ {
+ regNumber baseReg = lea->Base()->gtRegNum;
+ regNumber indexReg = lea->Index()->gtRegNum;
+ getEmitter()->emitIns_R_ARX(INS_lea, size, lea->gtRegNum, baseReg, indexReg, lea->gtScale, lea->gtOffset);
+ }
+ else if (lea->Base())
+ {
+ getEmitter()->emitIns_R_AR(INS_lea, size, lea->gtRegNum, lea->Base()->gtRegNum, lea->gtOffset);
+ }
+ else if (lea->Index())
+ {
+ getEmitter()->emitIns_R_ARX(INS_lea, size, lea->gtRegNum, REG_NA, lea->Index()->gtRegNum, lea->gtScale,
+ lea->gtOffset);
+ }
+
+ genProduceReg(lea);
+}
+
+//-------------------------------------------------------------------------------------------
+// genJumpKindsForTree: Determine the number and kinds of conditional branches
+// necessary to implement the given GT_CMP node
+//
+// Arguments:
+// cmpTree - (input) The GenTree node that is used to set the Condition codes
+// - The GenTree Relop node that was used to set the Condition codes
+// jmpKind[2] - (output) One or two conditional branch instructions
+// jmpToTrueLabel[2] - (output) When true we branch to the true case
+// When false we create a second label and branch to the false case
+// Only GT_EQ for floating point compares can have a false value.
+//
+// Return Value:
+// Sets the proper values into the array elements of jmpKind[] and jmpToTrueLabel[]
+//
+// Assumptions:
+// At least one conditional branch instruction will be returned.
+// Typically only one conditional branch is needed
+// and the second jmpKind[] value is set to EJ_NONE
+//
+// Notes:
+// jmpToTrueLabel[i]= true implies branch when the compare operation is true.
+// jmpToTrueLabel[i]= false implies branch when the compare operation is false.
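+//
+// For example (derived from the cases below, shown only as an illustration): an ordered floating
+// point GT_EQ produces jmpKind[] = { EJ_jpe, EJ_je } with jmpToTrueLabel[] = { false, true }, i.e.
+// jump to the false label on an unordered result (PF == 1), otherwise jump to the true label on
+// equality.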
+//-------------------------------------------------------------------------------------------
+
+// static
+void CodeGen::genJumpKindsForTree(GenTreePtr cmpTree, emitJumpKind jmpKind[2], bool jmpToTrueLabel[2])
+{
+ // Except for BEQ (= ordered GT_EQ) both jumps are to the true label.
+ jmpToTrueLabel[0] = true;
+ jmpToTrueLabel[1] = true;
+
+ // For integer comparisons just use genJumpKindForOper
+ if (!varTypeIsFloating(cmpTree->gtOp.gtOp1->gtEffectiveVal()))
+ {
+ CompareKind compareKind = ((cmpTree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
+ jmpKind[0] = genJumpKindForOper(cmpTree->gtOper, compareKind);
+ jmpKind[1] = EJ_NONE;
+ }
+ else
+ {
+ assert(cmpTree->OperIsCompare());
+
+ // For details on how we arrived at this mapping, see the comment block in genCodeForTreeNode()
+        // while generating code for compare operators (e.g. GT_EQ etc).
+ if ((cmpTree->gtFlags & GTF_RELOP_NAN_UN) != 0)
+ {
+            // Must branch if we have a NaN, unordered
+ switch (cmpTree->gtOper)
+ {
+ case GT_LT:
+ case GT_GT:
+ jmpKind[0] = EJ_jb;
+ jmpKind[1] = EJ_NONE;
+ break;
+
+ case GT_LE:
+ case GT_GE:
+ jmpKind[0] = EJ_jbe;
+ jmpKind[1] = EJ_NONE;
+ break;
+
+ case GT_NE:
+ jmpKind[0] = EJ_jpe;
+ jmpKind[1] = EJ_jne;
+ break;
+
+ case GT_EQ:
+ jmpKind[0] = EJ_je;
+ jmpKind[1] = EJ_NONE;
+ break;
+
+ default:
+ unreached();
+ }
+ }
+ else // ((cmpTree->gtFlags & GTF_RELOP_NAN_UN) == 0)
+ {
+            // Do not branch if we have a NaN, unordered
+ switch (cmpTree->gtOper)
+ {
+ case GT_LT:
+ case GT_GT:
+ jmpKind[0] = EJ_ja;
+ jmpKind[1] = EJ_NONE;
+ break;
+
+ case GT_LE:
+ case GT_GE:
+ jmpKind[0] = EJ_jae;
+ jmpKind[1] = EJ_NONE;
+ break;
+
+ case GT_NE:
+ jmpKind[0] = EJ_jne;
+ jmpKind[1] = EJ_NONE;
+ break;
+
+ case GT_EQ:
+ jmpKind[0] = EJ_jpe;
+ jmpKind[1] = EJ_je;
+ jmpToTrueLabel[0] = false;
+ break;
+
+ default:
+ unreached();
+ }
+ }
+ }
+}
+
+#if !defined(_TARGET_64BIT_)
+//------------------------------------------------------------------------
+// genJumpKindsForTreeLongHi: Generate the jump types for compare
+// operators of the high parts of a compare with long type operands
+// on x86 for the case where rel-op result needs to be materialized into a
+// register.
+//
+// Arguments:
+//    cmpTree - The GT_CMP node
+//    jmpKind - Return array of jump kinds
+//
+// Return Value:
+// None.
+//
+void CodeGen::genJumpKindsForTreeLongHi(GenTreePtr cmpTree, emitJumpKind jmpKind[2])
+{
+ assert(cmpTree->OperIsCompare());
+ CompareKind compareKind = ((cmpTree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
+
+ switch (cmpTree->gtOper)
+ {
+ case GT_LT:
+ case GT_LE:
+ if (compareKind == CK_SIGNED)
+ {
+ jmpKind[0] = EJ_jl;
+ jmpKind[1] = EJ_jg;
+ }
+ else
+ {
+ jmpKind[0] = EJ_jb;
+ jmpKind[1] = EJ_ja;
+ }
+ break;
+
+ case GT_GT:
+ case GT_GE:
+ if (compareKind == CK_SIGNED)
+ {
+ jmpKind[0] = EJ_jg;
+ jmpKind[1] = EJ_jl;
+ }
+ else
+ {
+ jmpKind[0] = EJ_ja;
+ jmpKind[1] = EJ_jb;
+ }
+ break;
+
+ case GT_EQ:
+ // GT_EQ will not jump to the true label if the hi parts are equal
+ jmpKind[0] = EJ_NONE;
+ jmpKind[1] = EJ_jne;
+ break;
+
+ case GT_NE:
+ // GT_NE will always jump to the true label if the high parts are not equal
+ jmpKind[0] = EJ_jne;
+ jmpKind[1] = EJ_NONE;
+ break;
+
+ default:
+ unreached();
+ }
+}
+
+//------------------------------------------------------------------------
+// genCompareLong: Generate code for comparing two longs on x86 when the result of the compare
+// is manifested in a register.
+//
+// Arguments:
+// treeNode - the compare tree
+//
+// Return Value:
+// None.
+// Comments:
+// For long compares, we need to compare the high parts of operands first, then the low parts.
+// If the high compare is false, we do not need to compare the low parts. For less than and
+// greater than, if the high compare is true, we can assume the entire compare is true. For
+// compares that are realized in a register, we will generate:
+//
+// Opcode x86 equivalent Comment
+// ------ -------------- -------
+// GT_EQ cmp hiOp1,hiOp2 If any part is not equal, the entire compare
+// jne label is false.
+// cmp loOp1,loOp2
+// label: sete
+//
+// GT_NE cmp hiOp1,hiOp2 If any part is not equal, the entire compare
+// jne label is true.
+// cmp loOp1,loOp2
+// label: setne
+//
+// GT_LT; unsigned cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set
+// jne label correctly and we do not need to check lo. Otherwise,
+// cmp loOp1,loOp2 we need to compare the lo halves
+// label: setb
+//
+// GT_LE; unsigned cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set
+// jne label correctly and we do not need to check lo. Otherwise,
+// cmp loOp1,loOp2 we need to compare the lo halves
+// label: setbe
+//
+// GT_GT; unsigned cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set
+// jne label correctly and we do not need to check lo. Otherwise,
+// cmp loOp1,loOp2 we need to compare the lo halves
+// label: seta
+//
+// GT_GE; unsigned cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set
+// jne label correctly and we do not need to check lo. Otherwise,
+// cmp loOp1,loOp2 we need to compare the lo halves
+// label: setae
+//
+// For signed long comparisons, we need additional labels, as we need to use signed conditions on the
+// "set" instruction:
+//
+// GT_LT; signed cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set
+// jne labelHi correctly and we do not need to check lo. Otherwise,
+// cmp loOp1,loOp2 we need to compare the lo halves
+// setb Unsigned set for lo compare
+// jmp labelFinal
+// labelHi: setl Signed set for high compare
+// labelFinal:
+//
+// GT_LE; signed cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set
+// jne labelHi correctly and we do not need to check lo. Otherwise,
+// cmp loOp1,loOp2 we need to compare the lo halves
+// setbe Unsigned set for lo compare
+// jmp labelFinal
+// labelHi: setle Signed set for hi compare
+// labelFinal:
+//
+// GT_GT; signed cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set
+// jne labelHi correctly and we do not need to check lo. Otherwise,
+// cmp loOp1,loOp2 we need to compare the lo halves
+// seta Unsigned set for lo compare
+// jmp labelFinal
+// labelHi: setg Signed set for high compare
+// labelFinal
+//
+// GT_GE; signed cmp hiOp1,hiOp2 If hiOp1 is not equal to hiOp2, the flags are set
+// jne labelHi correctly and we do not need to check lo. Otherwise,
+// cmp loOp1,loOp2 we need to compare the lo halves
+// setae Unsigned set for lo compare
+// jmp labelFinal
+// labelHi: setge Signed set for hi compare
+// labelFinal:
+//
+// TODO-X86-CQ: Check if hi or lo parts of op2 are 0 and change the compare to a test.
+void CodeGen::genCompareLong(GenTreePtr treeNode)
+{
+ assert(treeNode->OperIsCompare());
+
+ GenTreeOp* tree = treeNode->AsOp();
+ GenTreePtr op1 = tree->gtOp1;
+ GenTreePtr op2 = tree->gtOp2;
+
+ assert(varTypeIsLong(op1->TypeGet()));
+ assert(varTypeIsLong(op2->TypeGet()));
+
+ regNumber targetReg = treeNode->gtRegNum;
+
+ genConsumeOperands(tree);
+
+ assert(targetReg != REG_NA);
+
+ GenTreePtr loOp1 = op1->gtGetOp1();
+ GenTreePtr hiOp1 = op1->gtGetOp2();
+ GenTreePtr loOp2 = op2->gtGetOp1();
+ GenTreePtr hiOp2 = op2->gtGetOp2();
+
+ // Create compare for the high parts
+ instruction ins = INS_cmp;
+ var_types cmpType = TYP_INT;
+ emitAttr cmpAttr = emitTypeSize(cmpType);
+
+ // Emit the compare instruction
+ getEmitter()->emitInsBinary(ins, cmpAttr, hiOp1, hiOp2);
+
+ // Generate the first jump for the high compare
+ CompareKind compareKind = ((tree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
+
+ BasicBlock* labelHi = genCreateTempLabel();
+ BasicBlock* labelFinal = genCreateTempLabel();
+
+ if (compareKind == CK_SIGNED && (tree->gtOper != GT_NE && tree->gtOper != GT_EQ))
+ {
+ // If we are doing a signed comparison, we need to do a signed set if the high compare is true,
+ // but an unsigned set if we fall through to the low compare. If we have a GT_NE or GT_EQ, we do not
+ // need to worry about the sign of the comparison, so we can use the simplified case.
+
+ // We only have to check for equality for the hi comparison. If they are not equal, then the set will
+ // do the right thing. If they are equal, we have to check the lo halves.
+ inst_JMP(EJ_jne, labelHi);
+
+ // Emit the comparison. Perform the set for the lo. Jump to labelFinal
+ getEmitter()->emitInsBinary(ins, cmpAttr, loOp1, loOp2);
+
+ // The low set must be unsigned
+ emitJumpKind jumpKindLo = genJumpKindForOper(tree->gtOper, CK_UNSIGNED);
+
+ inst_SET(jumpKindLo, targetReg);
+ // Set the higher bytes to 0
+ inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
+ genProduceReg(tree);
+
+ inst_JMP(EJ_jmp, labelFinal);
+
+ // Define the label for hi jump target here. If we have jumped here, we want to set
+ // the target register based on the jump kind of the actual compare type.
+
+ genDefineTempLabel(labelHi);
+ inst_SET(genJumpKindForOper(tree->gtOper, compareKind), targetReg);
+
+ // Set the higher bytes to 0
+ inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
+ genProduceReg(tree);
+
+ genDefineTempLabel(labelFinal);
+ }
+ else
+ {
+ // If the compare is unsigned, or if the sign doesn't change the set instruction, we can use
+ // the same set logic for both the hi and lo compare, so we don't need to jump to a high label,
+ // we can just jump to the set that the lo compare will use.
+
+ // We only have to check for equality for the hi comparison. If they are not equal, then the set will
+ // do the right thing. If they are equal, we have to check the lo halves.
+ inst_JMP(EJ_jne, labelFinal);
+
+ // Emit the comparison
+ getEmitter()->emitInsBinary(ins, cmpAttr, loOp1, loOp2);
+
+ // Define the label for hi jump target here. If we have jumped here, we want to set
+ // the target register based on the jump kind of the lower half (the actual compare
+ // type). If we have fallen through, then we are doing a normal int compare for the
+ // lower parts
+
+ genDefineTempLabel(labelFinal);
+
+ // The low set must be unsigned
+ emitJumpKind jumpKindLo = genJumpKindForOper(tree->gtOper, CK_UNSIGNED);
+
+ inst_SET(jumpKindLo, targetReg);
+ // Set the higher bytes to 0
+ inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
+ genProduceReg(tree);
+ }
+}
+
+//------------------------------------------------------------------------
+// genJTrueLong: Generate code for comparing two longs on x86 for the case where the result
+// is not manifested in a register.
+//
+// Arguments:
+// treeNode - the compare tree
+//
+// Return Value:
+// None.
+// Comments:
+// For long compares, we need to compare the high parts of operands first, then the low parts.
+// We only have to do the low compare if the high parts of the operands are equal.
+//
+// In the case where the result of a rel-op is not realized in a register, we generate:
+//
+// Opcode x86 equivalent Comment
+// ------ -------------- -------
+//
+// GT_LT; unsigned cmp hiOp1,hiOp2
+// jb trueLabel
+// ja falseLabel
+// cmp loOp1,loOp2
+// jb trueLabel
+// falseLabel:
+//
+// GT_LE; unsigned cmp hiOp1,hiOp2
+// jb trueLabel
+// ja falseLabel
+// cmp loOp1,loOp2
+// jbe trueLabel
+// falseLabel:
+//
+// GT_GT; unsigned cmp hiOp1,hiOp2
+// ja trueLabel
+// jb falseLabel
+// cmp loOp1,loOp2
+// ja trueLabel
+// falseLabel:
+//
+// GT_GE; unsigned cmp hiOp1,hiOp2
+// ja trueLabel
+// jb falseLabel
+// cmp loOp1,loOp2
+// jae trueLabel
+// falseLabel:
+//
+// GT_LT; signed cmp hiOp1,hiOp2
+// jl trueLabel
+// jg falseLabel
+// cmp loOp1,loOp2
+// jb trueLabel
+// falseLabel:
+//
+// GT_LE; signed cmp hiOp1,hiOp2
+// jl trueLabel
+// jg falseLabel
+// cmp loOp1,loOp2
+// jbe trueLabel
+// falseLabel:
+//
+// GT_GT; signed cmp hiOp1,hiOp2
+// jg trueLabel
+// jl falseLabel
+// cmp loOp1,loOp2
+// ja trueLabel
+// falseLabel:
+//
+// GT_GE; signed cmp hiOp1,hiOp2
+// jg trueLabel
+// jl falseLabel
+// cmp loOp1,loOp2
+// jae trueLabel
+// falseLabel:
+//
+// GT_EQ; cmp hiOp1,hiOp2
+// jne falseLabel
+// cmp loOp1,loOp2
+// je trueLabel
+// falseLabel:
+//
+// GT_NE; cmp hiOp1,hiOp2
+// jne labelTrue
+// cmp loOp1,loOp2
+// jne trueLabel
+// falseLabel:
+//
+// TODO-X86-CQ: Check if hi or lo parts of op2 are 0 and change the compare to a test.
+void CodeGen::genJTrueLong(GenTreePtr treeNode)
+{
+ assert(treeNode->OperIsCompare());
+
+ GenTreeOp* tree = treeNode->AsOp();
+ GenTreePtr op1 = tree->gtOp1;
+ GenTreePtr op2 = tree->gtOp2;
+
+ assert(varTypeIsLong(op1->TypeGet()));
+ assert(varTypeIsLong(op2->TypeGet()));
+
+ regNumber targetReg = treeNode->gtRegNum;
+
+ assert(targetReg == REG_NA);
+
+ GenTreePtr loOp1 = op1->gtGetOp1();
+ GenTreePtr hiOp1 = op1->gtGetOp2();
+ GenTreePtr loOp2 = op2->gtGetOp1();
+ GenTreePtr hiOp2 = op2->gtGetOp2();
+
+ // Emit the compare instruction
+ getEmitter()->emitInsBinary(INS_cmp, EA_4BYTE, hiOp1, hiOp2);
+
+ // Generate the first jump for the high compare
+ CompareKind compareKind = ((tree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
+
+ // TODO-X86-CQ: If the next block is a BBJ_ALWAYS, we can set falseLabel = compiler->compCurBB->bbNext->bbJumpDest.
+ BasicBlock* falseLabel = genCreateTempLabel();
+
+ emitJumpKind jumpKindHi[2];
+
+ // Generate the jumps for the high compare
+ genJumpKindsForTreeLongHi(tree, jumpKindHi);
+
+ BasicBlock* trueLabel = compiler->compCurBB->bbJumpDest;
+
+ if (jumpKindHi[0] != EJ_NONE)
+ {
+ inst_JMP(jumpKindHi[0], trueLabel);
+ }
+
+ if (jumpKindHi[1] != EJ_NONE)
+ {
+ inst_JMP(jumpKindHi[1], falseLabel);
+ }
+
+ // The low jump must be unsigned
+ emitJumpKind jumpKindLo = genJumpKindForOper(tree->gtOper, CK_UNSIGNED);
+
+ // Emit the comparison and the jump to the trueLabel
+ getEmitter()->emitInsBinary(INS_cmp, EA_4BYTE, loOp1, loOp2);
+
+ inst_JMP(jumpKindLo, trueLabel);
+
+ // Generate falseLabel, which is the false path. We will jump here if the high compare is false
+ // or fall through if the low compare is false.
+ genDefineTempLabel(falseLabel);
+}
+#endif //! defined(_TARGET_64BIT_)
+
+//------------------------------------------------------------------------
+// genCompareFloat: Generate code for comparing two floating point values
+//
+// Arguments:
+// treeNode - the compare tree
+//
+// Return Value:
+// None.
+// Comments:
+// The SSE2 instruction ucomis[s|d] performs an unordered comparison and
+// updates the rFLAGS register as follows.
+// Result of compare ZF PF CF
+// ----------------- ------------
+// Unordered 1 1 1 <-- this result implies one of the operands of the compare is a NaN.
+// Greater 0 0 0
+// Less Than 0 0 1
+// Equal 1 0 0
+//
+// From the above table the following equalities follow. As per ECMA spec *.UN opcodes perform
+// unordered comparison of floating point values. That is *.UN comparisons result in true when
+// one of the operands is a NaN, whereas ordered comparisons result in false.
+//
+// Opcode Amd64 equivalent Comment
+// ------ ----------------- --------
+// BLT.UN(a,b) ucomis[s|d] a, b Jb branches if CF=1, which means either a<b or unordered from the above
+// jb table
+//
+// BLT(a,b) ucomis[s|d] b, a Ja branches if CF=0 and ZF=0, which means b>a that in turn implies a<b
+// ja
+//
+// BGT.UN(a,b) ucomis[s|d] b, a branch if b<a or unordered ==> branch if a>b or unordered
+// jb
+//
+// BGT(a, b) ucomis[s|d] a, b branch if a>b
+// ja
+//
+// BLE.UN(a,b) ucomis[s|d] a, b jbe branches if CF=1 or ZF=1, which implies a<=b or unordered
+// jbe
+//
+// BLE(a,b) ucomis[s|d] b, a jae branches if CF=0, which means b>=a, i.e. a<=b
+// jae
+//
+// BGE.UN(a,b) ucomis[s|d] b, a branch if b<=a or unordered ==> branch if a>=b or unordered
+// jbe
+//
+// BGE(a,b) ucomis[s|d] a, b branch if a>=b
+// jae
+//
+// BEQ.UN(a,b) ucomis[s|d] a, b branch if a==b or unordered. There is no BEQ.UN opcode in ECMA spec.
+// je This case is given for completeness, in case the JIT generates such
+// a gentree internally.
+//
+// BEQ(a,b) ucomis[s|d] a, b From the above table, PF=0 and ZF=1 corresponds to a==b.
+// jpe L1
+// je <true label>
+// L1:
+//
+// BNE(a,b) ucomis[s|d] a, b branch if a!=b. There is no BNE opcode in ECMA spec. This case is
+// jne given for completeness, in case the JIT generates such a gentree
+// internally.
+//
+// BNE.UN(a,b) ucomis[s|d] a, b From the above table, PF=1 or ZF=0 implies unordered or a!=b
+// jpe <true label>
+// jne <true label>
+//
+// As the above equalities show, the operands of a compare operator need to be
+// reversed in the case of BLT/CLT, BGT.UN/CGT.UN, BLE/CLE, BGE.UN/CGE.UN.
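+//
+// For example, BLT(a,b) is emitted with the operands reversed - "ucomis[s|d] b, a" followed by
+// "ja <true>" - rather than "ucomis[s|d] a, b; jb <true>", because jb (CF=1) would also be taken
+// in the unordered (NaN) case, which an ordered BLT must treat as false.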
+void CodeGen::genCompareFloat(GenTreePtr treeNode)
+{
+ assert(treeNode->OperIsCompare());
+
+ GenTreeOp* tree = treeNode->AsOp();
+ GenTreePtr op1 = tree->gtOp1;
+ GenTreePtr op2 = tree->gtOp2;
+ var_types op1Type = op1->TypeGet();
+ var_types op2Type = op2->TypeGet();
+
+ genConsumeOperands(tree);
+
+ assert(varTypeIsFloating(op1Type));
+ assert(op1Type == op2Type);
+
+ regNumber targetReg = treeNode->gtRegNum;
+ instruction ins;
+ emitAttr cmpAttr;
+
+ bool reverseOps;
+ if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0)
+ {
+ // Unordered comparison case
+ reverseOps = (tree->gtOper == GT_GT || tree->gtOper == GT_GE);
+ }
+ else
+ {
+ reverseOps = (tree->gtOper == GT_LT || tree->gtOper == GT_LE);
+ }
+
+ if (reverseOps)
+ {
+ GenTreePtr tmp = op1;
+ op1 = op2;
+ op2 = tmp;
+ }
+
+ ins = ins_FloatCompare(op1Type);
+ cmpAttr = emitTypeSize(op1Type);
+
+ getEmitter()->emitInsBinary(ins, cmpAttr, op1, op2);
+
+ // Are we evaluating this into a register?
+ if (targetReg != REG_NA)
+ {
+ genSetRegToCond(targetReg, tree);
+ genProduceReg(tree);
+ }
+}
+
+//------------------------------------------------------------------------
+// genCompareInt: Generate code for comparing ints or, on amd64, longs.
+//
+// Arguments:
+// treeNode - the compare tree
+//
+// Return Value:
+// None.
+void CodeGen::genCompareInt(GenTreePtr treeNode)
+{
+ assert(treeNode->OperIsCompare());
+
+ GenTreeOp* tree = treeNode->AsOp();
+ GenTreePtr op1 = tree->gtOp1;
+ GenTreePtr op2 = tree->gtOp2;
+ var_types op1Type = op1->TypeGet();
+ var_types op2Type = op2->TypeGet();
+
+ genConsumeOperands(tree);
+
+ instruction ins;
+ emitAttr cmpAttr;
+
+ regNumber targetReg = treeNode->gtRegNum;
+ assert(!op1->isContainedIntOrIImmed()); // We no longer support swapping op1 and op2 to generate cmp reg, imm
+ assert(!varTypeIsFloating(op2Type));
+
+#ifdef _TARGET_X86_
+ assert(!varTypeIsLong(op1Type) && !varTypeIsLong(op2Type));
+#endif // _TARGET_X86_
+
+ // By default we use an int32 sized cmp instruction
+ //
+ ins = INS_cmp;
+ var_types cmpType = TYP_INT;
+
+ // In the if/then/else statement below we may change the
+ // 'cmpType' and/or 'ins' to generate a smaller instruction
+
+ // Are we comparing two values that are the same size?
+ //
+ if (genTypeSize(op1Type) == genTypeSize(op2Type))
+ {
+ if (op1Type == op2Type)
+ {
+ // If both types are exactly the same we can use that type
+ cmpType = op1Type;
+ }
+ else if (genTypeSize(op1Type) == 8)
+ {
+ // If we have two different int64 types we need to use a long compare
+ cmpType = TYP_LONG;
+ }
+
+ cmpAttr = emitTypeSize(cmpType);
+ }
+ else // Here we know that (op1Type != op2Type)
+ {
+ // Do we have a short compare against a constant in op2?
+ //
+ // We checked for this case in LowerCmp(); if we can perform a small compare
+ // against an immediate, we marked this compare with GTF_RELOP_SMALL, and for
+ // unsigned small non-equality compares also with the GTF_UNSIGNED flag.
+ //
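+ // For example (hypothetical, assuming lowering marked the node GTF_RELOP_SMALL): comparing a
+ // TYP_UBYTE local against the constant 10 can be emitted as a byte-sized compare such as
+ // "cmp byte ptr [mem], 10" rather than widening the operand to 32 bits first.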
+ if (op2->isContainedIntOrIImmed() && ((tree->gtFlags & GTF_RELOP_SMALL) != 0))
+ {
+ assert(varTypeIsSmall(op1Type));
+ cmpType = op1Type;
+ }
+#ifdef _TARGET_AMD64_
+ else // compare two different sized operands
+ {
+ // For this case we don't want any memory operands, only registers or immediates
+ //
+ assert(!op1->isContainedMemoryOp());
+ assert(!op2->isContainedMemoryOp());
+
+ // Check for the case where one operand is an int64 type
+ // Lower should have placed the 32-bit operand in a register;
+ // for signed comparisons we will sign-extend the 32-bit value in place.
+ //
+ bool op1Is64Bit = (genTypeSize(op1Type) == 8);
+ bool op2Is64Bit = (genTypeSize(op2Type) == 8);
+ if (op1Is64Bit)
+ {
+ cmpType = TYP_LONG;
+ if (!(tree->gtFlags & GTF_UNSIGNED) && !op2Is64Bit)
+ {
+ assert(op2->gtRegNum != REG_NA);
+ inst_RV_RV(INS_movsxd, op2->gtRegNum, op2->gtRegNum, op2Type);
+ }
+ }
+ else if (op2Is64Bit)
+ {
+ cmpType = TYP_LONG;
+ if (!(tree->gtFlags & GTF_UNSIGNED) && !op1Is64Bit)
+ {
+ assert(op1->gtRegNum != REG_NA);
+ }
+ }
+ }
+#endif // _TARGET_AMD64_
+
+ cmpAttr = emitTypeSize(cmpType);
+ }
+
+ // See if we can generate a "test" instruction instead of a "cmp".
+ // For this to generate the correct conditional branch we must have
+ // a compare against zero.
+ //
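+ // For example, GT_EQ(x, 0) with x in a register can be emitted as "test reg, reg" followed by
+ // sete/je; this sets ZF exactly as "cmp reg, 0" would, but with a smaller encoding.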
+ if (op2->IsIntegralConst(0))
+ {
+ if (op1->isContained())
+ {
+ // op1 can be a contained memory op
+ // or the special contained GT_AND that we created in Lowering::LowerCmp()
+ //
+ if ((op1->OperGet() == GT_AND))
+ {
+ noway_assert(op1->gtOp.gtOp2->isContainedIntOrIImmed());
+
+ ins = INS_test; // we will generate "test andOp1, andOp2CnsVal"
+ op2 = op1->gtOp.gtOp2; // must assign op2 before we overwrite op1
+ op1 = op1->gtOp.gtOp1; // overwrite op1
+
+ if (op1->isContainedMemoryOp())
+ {
+ // use the size of andOp1 if it is a contained memory op.
+ cmpAttr = emitTypeSize(op1->TypeGet());
+ }
+ // fallthrough to emit->emitInsBinary(ins, cmpAttr, op1, op2);
+ }
+ }
+ else // op1 is not contained thus it must be in a register
+ {
+ ins = INS_test;
+ op2 = op1; // we will generate "test reg1,reg1"
+ // fallthrough to emit->emitInsBinary(ins, cmpAttr, op1, op2);
+ }
+ }
+
+ getEmitter()->emitInsBinary(ins, cmpAttr, op1, op2);
+
+ // Are we evaluating this into a register?
+ if (targetReg != REG_NA)
+ {
+ genSetRegToCond(targetReg, tree);
+ genProduceReg(tree);
+ }
+}
+
+//-------------------------------------------------------------------------------------------
+// genSetRegToCond: Set a register 'dstReg' to the appropriate one or zero value
+// corresponding to a binary Relational operator result.
+//
+// Arguments:
+// dstReg - The target register to set to 1 or 0
+// tree - The GenTree Relop node that was used to set the Condition codes
+//
+// Return Value: none
+//
+// Notes:
+// A full 64-bit value of either 1 or 0 is setup in the 'dstReg'
+//-------------------------------------------------------------------------------------------
+
+void CodeGen::genSetRegToCond(regNumber dstReg, GenTreePtr tree)
+{
+ noway_assert((genRegMask(dstReg) & RBM_BYTE_REGS) != 0);
+
+ emitJumpKind jumpKind[2];
+ bool branchToTrueLabel[2];
+ genJumpKindsForTree(tree, jumpKind, branchToTrueLabel);
+
+ if (jumpKind[1] == EJ_NONE)
+ {
+ // Set (lower byte of) reg according to the flags
+ inst_SET(jumpKind[0], dstReg);
+ }
+ else
+ {
+#ifdef DEBUG
+ // jmpKind[1] != EJ_NONE implies BEQ or BNE.UN of floating point values.
+ // These are represented by two conditions.
+ if (tree->gtOper == GT_EQ)
+ {
+ // This must be an ordered comparison.
+ assert((tree->gtFlags & GTF_RELOP_NAN_UN) == 0);
+ }
+ else
+ {
+ // This must be BNE.UN
+ assert((tree->gtOper == GT_NE) && ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0));
+ }
+#endif
+
+ // Here is the sample code generated in each case:
+ // BEQ == cmp, jpe <false label>, je <true label>
+ // That is, to materialize the comparison, reg needs to be set if PF=0 and ZF=1
+ // setnp reg // if (PF==0) reg = 1 else reg = 0
+ // jpe L1 // Jmp if PF==1
+ // sete reg
+ // L1:
+ //
+ // BNE.UN == cmp, jpe <true label>, jne <true label>
+ // That is, to materialize the comparison, reg needs to be set if either PF=1 or ZF=0;
+ // setp reg
+ // jpe L1
+ // setne reg
+ // L1:
+
+ // reverse the jmpKind condition before setting dstReg if it branches to the false label.
+ inst_SET(branchToTrueLabel[0] ? jumpKind[0] : emitter::emitReverseJumpKind(jumpKind[0]), dstReg);
+
+ BasicBlock* label = genCreateTempLabel();
+ inst_JMP(jumpKind[0], label);
+
+ // second branch is always to true label
+ assert(branchToTrueLabel[1]);
+ inst_SET(jumpKind[1], dstReg);
+ genDefineTempLabel(label);
+ }
+
+ var_types treeType = tree->TypeGet();
+ if (treeType == TYP_INT || treeType == TYP_LONG)
+ {
+ // Set the higher bytes to 0
+ inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), dstReg, dstReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
+ }
+ else
+ {
+ noway_assert(treeType == TYP_BYTE);
+ }
+}
+
+//------------------------------------------------------------------------
+// genIntToIntCast: Generate code for an integer cast
+// This method handles integer overflow checking casts
+// as well as ordinary integer casts.
+//
+// Arguments:
+// treeNode - The GT_CAST node
+//
+// Return Value:
+// None.
+//
+// Assumptions:
+// The treeNode is not a contained node and must have an assigned register.
+// For a signed convert from byte, the source must be in a byte-addressable register.
+// Neither the source nor target type can be a floating point type.
+//
+// TODO-XArch-CQ: Allow castOp to be a contained node without an assigned register.
+// TODO: refactor to use getCastDescription
+//
+void CodeGen::genIntToIntCast(GenTreePtr treeNode)
+{
+ assert(treeNode->OperGet() == GT_CAST);
+
+ GenTreePtr castOp = treeNode->gtCast.CastOp();
+ regNumber targetReg = treeNode->gtRegNum;
+ regNumber sourceReg = castOp->gtRegNum;
+ var_types dstType = treeNode->CastToType();
+ bool isUnsignedDst = varTypeIsUnsigned(dstType);
+ var_types srcType = genActualType(castOp->TypeGet());
+ bool isUnsignedSrc = varTypeIsUnsigned(srcType);
+
+ // if necessary, force the srcType to unsigned when the GT_UNSIGNED flag is set
+ if (!isUnsignedSrc && (treeNode->gtFlags & GTF_UNSIGNED) != 0)
+ {
+ srcType = genUnsignedType(srcType);
+ isUnsignedSrc = true;
+ }
+
+ bool requiresOverflowCheck = false;
+ bool needAndAfter = false;
+
+ assert(genIsValidIntReg(targetReg));
+ assert(genIsValidIntReg(sourceReg));
+
+ instruction ins = INS_invalid;
+ emitAttr size = EA_UNKNOWN;
+
+ if (genTypeSize(srcType) < genTypeSize(dstType))
+ {
+ // Widening cast
+
+ // Is this an Overflow checking cast?
+ // We only need to handle one case, as the other casts can never overflow.
+ // cast from TYP_INT to TYP_ULONG
+ //
+ if (treeNode->gtOverflow() && (srcType == TYP_INT) && (dstType == TYP_ULONG))
+ {
+ requiresOverflowCheck = true;
+ size = EA_ATTR(genTypeSize(srcType));
+ ins = INS_mov;
+ }
+ else
+ {
+ // we need the source size
+ size = EA_ATTR(genTypeSize(srcType));
+ noway_assert(size < EA_PTRSIZE);
+
+ ins = ins_Move_Extend(srcType, castOp->InReg());
+
+ /*
+ Special case: ins_Move_Extend assumes the destination type is no bigger
+ than TYP_INT. movsx and movzx can already extend all the way to
+ 64-bit, and a regular 32-bit mov clears the high 32 bits (like the non-existent movzxd),
+ but for a sign extension from TYP_INT to TYP_LONG, we need to use the movsxd opcode.
+ */
+ if (!isUnsignedSrc && !isUnsignedDst && (size == EA_4BYTE) && (genTypeSize(dstType) > EA_4BYTE))
+ {
+#ifdef _TARGET_X86_
+ NYI_X86("Cast to 64 bit for x86/RyuJIT");
+#else // !_TARGET_X86_
+ ins = INS_movsxd;
+#endif // !_TARGET_X86_
+ }
+
+ /*
+ Special case: for a cast of byte to char we first
+ have to expand the byte (w/ sign extension), then
+ mask off the high bits.
+ Use 'movsx' followed by 'and'
+ */
+ if (!isUnsignedSrc && isUnsignedDst && (genTypeSize(dstType) < EA_4BYTE))
+ {
+ noway_assert(genTypeSize(dstType) == EA_2BYTE && size == EA_1BYTE);
+ needAndAfter = true;
+ }
+ }
+ }
+ else
+ {
+ // Narrowing cast, or sign-changing cast
+ noway_assert(genTypeSize(srcType) >= genTypeSize(dstType));
+
+ // Is this an Overflow checking cast?
+ if (treeNode->gtOverflow())
+ {
+ requiresOverflowCheck = true;
+ size = EA_ATTR(genTypeSize(srcType));
+ ins = INS_mov;
+ }
+ else
+ {
+ size = EA_ATTR(genTypeSize(dstType));
+ ins = ins_Move_Extend(dstType, castOp->InReg());
+ }
+ }
+
+ noway_assert(ins != INS_invalid);
+
+ genConsumeReg(castOp);
+
+ if (requiresOverflowCheck)
+ {
+ ssize_t typeMin = 0;
+ ssize_t typeMax = 0;
+ ssize_t typeMask = 0;
+ bool needScratchReg = false;
+ bool signCheckOnly = false;
+
+ /* Do we need to compare the value, or just check masks */
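+ // For instance (following the cases below): an int -> TYP_UBYTE overflow check only needs a
+ // mask test ("test sourceReg, 0xFFFFFF00" + "jne <throw>"), while an int -> TYP_SHORT check
+ // compares the value against SHRT_MIN and SHRT_MAX.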
+
+ switch (dstType)
+ {
+ case TYP_BYTE:
+ typeMask = ssize_t((int)0xFFFFFF80);
+ typeMin = SCHAR_MIN;
+ typeMax = SCHAR_MAX;
+ break;
+
+ case TYP_UBYTE:
+ typeMask = ssize_t((int)0xFFFFFF00L);
+ break;
+
+ case TYP_SHORT:
+ typeMask = ssize_t((int)0xFFFF8000);
+ typeMin = SHRT_MIN;
+ typeMax = SHRT_MAX;
+ break;
+
+ case TYP_CHAR:
+ typeMask = ssize_t((int)0xFFFF0000L);
+ break;
+
+ case TYP_INT:
+ if (srcType == TYP_UINT)
+ {
+ signCheckOnly = true;
+ }
+ else
+ {
+ typeMask = 0xFFFFFFFF80000000LL;
+ typeMin = INT_MIN;
+ typeMax = INT_MAX;
+ }
+ break;
+
+ case TYP_UINT:
+ if (srcType == TYP_INT)
+ {
+ signCheckOnly = true;
+ }
+ else
+ {
+ needScratchReg = true;
+ }
+ break;
+
+ case TYP_LONG:
+ noway_assert(srcType == TYP_ULONG);
+ signCheckOnly = true;
+ break;
+
+ case TYP_ULONG:
+ noway_assert((srcType == TYP_LONG) || (srcType == TYP_INT));
+ signCheckOnly = true;
+ break;
+
+ default:
+ NO_WAY("Unknown type");
+ return;
+ }
+
+ if (signCheckOnly)
+ {
+ // We only need to check for a negative value in sourceReg
+ inst_RV_IV(INS_cmp, sourceReg, 0, size);
+ genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW);
+ }
+ else
+ {
+ regNumber tmpReg = REG_NA;
+
+ if (needScratchReg)
+ {
+ // We need an additional temp register
+ // Make sure we have exactly one allocated.
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+ tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+ }
+
+ // When we are converting from unsigned or to unsigned, we
+ // will only have to check for any bits set using 'typeMask'
+ if (isUnsignedSrc || isUnsignedDst)
+ {
+ if (needScratchReg)
+ {
+ inst_RV_RV(INS_mov, tmpReg, sourceReg, TYP_LONG); // Move the 64-bit value to a writeable temp reg
+ inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, size, tmpReg, 32); // Shift right by 32 bits
+ genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); // Throw if the shift result is non-zero
+ }
+ else
+ {
+ noway_assert(typeMask != 0);
+ inst_RV_IV(INS_TEST, sourceReg, typeMask, size);
+ genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
+ }
+ }
+ else
+ {
+ // For a narrowing signed cast
+ //
+ // We must check the value is in a signed range.
+
+ // Compare with the MAX
+
+ noway_assert((typeMin != 0) && (typeMax != 0));
+
+ inst_RV_IV(INS_cmp, sourceReg, typeMax, size);
+ genJumpToThrowHlpBlk(EJ_jg, SCK_OVERFLOW);
+
+ // Compare with the MIN
+
+ inst_RV_IV(INS_cmp, sourceReg, typeMin, size);
+ genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW);
+ }
+ }
+
+ if (targetReg != sourceReg
+#ifdef _TARGET_AMD64_
+ // On amd64, we can hit this path for a same-register
+ // 4-byte to 8-byte widening conversion, and need to
+ // emit the instruction to set the high bits correctly.
+ || (EA_ATTR(genTypeSize(dstType)) == EA_8BYTE && EA_ATTR(genTypeSize(srcType)) == EA_4BYTE)
+#endif // _TARGET_AMD64_
+ )
+ inst_RV_RV(ins, targetReg, sourceReg, srcType, size);
+ }
+ else // non-overflow checking cast
+ {
+ noway_assert(size < EA_PTRSIZE || srcType == dstType);
+
+ // We may have code transformations that result in casts where srcType is the same as dstType.
+ // e.g. Bug 824281, in which a comma is split by the rationalizer, leaving an assignment of a
+ // long constant to a long lclVar.
+ if (srcType == dstType)
+ {
+ ins = INS_mov;
+ }
+ /* Is the value sitting in a non-byte-addressable register? */
+ else if (castOp->InReg() && (size == EA_1BYTE) && !isByteReg(sourceReg))
+ {
+ if (isUnsignedDst)
+ {
+ // for unsigned values we can AND, so it need not be a byte register
+ ins = INS_AND;
+ }
+ else
+ {
+ // Move the value into a byte register
+ noway_assert(!"Signed byte convert from non-byte-addressable register");
+ }
+
+ /* Generate "mov targetReg, castOp->gtReg */
+ if (targetReg != sourceReg)
+ {
+ inst_RV_RV(INS_mov, targetReg, sourceReg, srcType);
+ }
+ }
+
+ if (ins == INS_AND)
+ {
+ noway_assert((needAndAfter == false) && isUnsignedDst);
+
+ /* Generate "and reg, MASK */
+ unsigned fillPattern;
+ if (size == EA_1BYTE)
+ {
+ fillPattern = 0xff;
+ }
+ else if (size == EA_2BYTE)
+ {
+ fillPattern = 0xffff;
+ }
+ else
+ {
+ fillPattern = 0xffffffff;
+ }
+
+ inst_RV_IV(INS_AND, targetReg, fillPattern, EA_4BYTE);
+ }
+#ifdef _TARGET_AMD64_
+ else if (ins == INS_movsxd)
+ {
+ noway_assert(!needAndAfter);
+ inst_RV_RV(ins, targetReg, sourceReg, srcType, size);
+ }
+#endif // _TARGET_AMD64_
+ else if (ins == INS_mov)
+ {
+ noway_assert(!needAndAfter);
+ if (targetReg != sourceReg
+#ifdef _TARGET_AMD64_
+ // On amd64, 'mov' is the opcode used to zero-extend from
+ // 4 bytes to 8 bytes.
+ || (EA_ATTR(genTypeSize(dstType)) == EA_8BYTE && EA_ATTR(genTypeSize(srcType)) == EA_4BYTE)
+#endif // _TARGET_AMD64_
+ )
+ {
+ inst_RV_RV(ins, targetReg, sourceReg, srcType, size);
+ }
+ }
+ else
+ {
+ noway_assert(ins == INS_movsx || ins == INS_movzx);
+
+ /* Generate "mov targetReg, castOp->gtReg */
+ inst_RV_RV(ins, targetReg, sourceReg, srcType, size);
+
+ /* Mask off high bits for cast from byte to char */
+ if (needAndAfter)
+ {
+ noway_assert(genTypeSize(dstType) == 2 && ins == INS_movsx);
+ inst_RV_IV(INS_AND, targetReg, 0xFFFF, EA_4BYTE);
+ }
+ }
+ }
+
+ genProduceReg(treeNode);
+}
+
+//------------------------------------------------------------------------
+// genFloatToFloatCast: Generate code for a cast between float and double
+//
+// Arguments:
+// treeNode - The GT_CAST node
+//
+// Return Value:
+// None.
+//
+// Assumptions:
+// Cast is a non-overflow conversion.
+// The treeNode must have an assigned register.
+// The cast is between float and double or vice versa.
+//
+void CodeGen::genFloatToFloatCast(GenTreePtr treeNode)
+{
+ // float <--> double conversions are always non-overflow ones
+ assert(treeNode->OperGet() == GT_CAST);
+ assert(!treeNode->gtOverflow());
+
+ regNumber targetReg = treeNode->gtRegNum;
+ assert(genIsValidFloatReg(targetReg));
+
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+#ifdef DEBUG
+ // If not contained, must be a valid float reg.
+ if (!op1->isContained())
+ {
+ assert(genIsValidFloatReg(op1->gtRegNum));
+ }
+#endif
+
+ var_types dstType = treeNode->CastToType();
+ var_types srcType = op1->TypeGet();
+ assert(varTypeIsFloating(srcType) && varTypeIsFloating(dstType));
+
+ genConsumeOperands(treeNode->AsOp());
+ if (srcType == dstType && targetReg == op1->gtRegNum)
+ {
+ // source and destinations types are the same and also reside in the same register.
+ // we just need to consume and produce the reg in this case.
+ ;
+ }
+ else
+ {
+ instruction ins = ins_FloatConv(dstType, srcType);
+ getEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1);
+ }
+
+ genProduceReg(treeNode);
+}
+
+//------------------------------------------------------------------------
+// genIntToFloatCast: Generate code to cast an int/long to float/double
+//
+// Arguments:
+// treeNode - The GT_CAST node
+//
+// Return Value:
+// None.
+//
+// Assumptions:
+// Cast is a non-overflow conversion.
+// The treeNode must have an assigned register.
+// SrcType= int32/uint32/int64/uint64 and DstType=float/double.
+//
+void CodeGen::genIntToFloatCast(GenTreePtr treeNode)
+{
+ // int type --> float/double conversions are always non-overflow ones
+ assert(treeNode->OperGet() == GT_CAST);
+ assert(!treeNode->gtOverflow());
+
+ regNumber targetReg = treeNode->gtRegNum;
+ assert(genIsValidFloatReg(targetReg));
+
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+#ifdef DEBUG
+ if (!op1->isContained())
+ {
+ assert(genIsValidIntReg(op1->gtRegNum));
+ }
+#endif
+
+ var_types dstType = treeNode->CastToType();
+ var_types srcType = op1->TypeGet();
+ assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType));
+
+#if !defined(_TARGET_64BIT_)
+ NYI_IF(varTypeIsLong(srcType), "Conversion from long to float");
+#endif // !defined(_TARGET_64BIT_)
+
+ // Since the xarch emitter doesn't handle reporting gc-info correctly while casting away gc-ness, we
+ // ensure that the srcType of a cast is a non gc-type. Codegen should never see BYREF as a source type,
+ // except for GT_LCL_VAR_ADDR and GT_LCL_FLD_ADDR, which represent stack addresses and can be considered
+ // as TYP_I_IMPL. In all other cases, where the src operand is a gc-type and not known to be on the stack,
+ // the front-end (see fgMorphCast()) ensures this by assigning the gc-type local to a non gc-type
+ // temp and using that temp as the operand of the cast operation.
+ if (srcType == TYP_BYREF)
+ {
+ noway_assert(op1->OperGet() == GT_LCL_VAR_ADDR || op1->OperGet() == GT_LCL_FLD_ADDR);
+ srcType = TYP_I_IMPL;
+ }
+
+ // force the srcType to unsigned if GT_UNSIGNED flag is set
+ if (treeNode->gtFlags & GTF_UNSIGNED)
+ {
+ srcType = genUnsignedType(srcType);
+ }
+
+ noway_assert(!varTypeIsGC(srcType));
+
+ // We should never be seeing srcType whose size is not sizeof(int) nor sizeof(long).
+ // For conversions from byte/sbyte/int16/uint16 to float/double, we would expect
+ // either the front-end or lowering phase to have generated two levels of cast.
+ // The first one is for widening smaller int type to int32 and the second one is
+ // to the float/double.
+ emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
+ noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG))));
+
+ // Also we don't expect to see uint32 -> float/double and uint64 -> float conversions
+ // here since they should have been lowered appropriately.
+ noway_assert(srcType != TYP_UINT);
+ noway_assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT));
+
+ // To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used
+ // which does a partial write to lower 4/8 bytes of xmm register keeping the other
+ // upper bytes unmodified. If "cvtsi2ss/sd xmmReg, r32/r64" occurs inside a loop,
+ // the partial write could introduce a false dependency and could cause a stall
+ // if there are further uses of xmmReg. We have such a case occurring with a
+ // customer reported version of SpectralNorm benchmark, resulting in 2x perf
+ // regression. To avoid false dependency, we emit "xorps xmmReg, xmmReg" before
+ // cvtsi2ss/sd instruction.
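+ //
+ // Roughly, for an int32 -> double conversion the emitted sequence is (register names are
+ // illustrative only):
+ // xorps xmm0, xmm0
+ // cvtsi2sd xmm0, eax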
+
+ genConsumeOperands(treeNode->AsOp());
+ getEmitter()->emitIns_R_R(INS_xorps, EA_4BYTE, treeNode->gtRegNum, treeNode->gtRegNum);
+
+ // Note that here we need to specify srcType that will determine
+ // the size of source reg/mem operand and rex.w prefix.
+ instruction ins = ins_FloatConv(dstType, TYP_INT);
+ getEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1);
+
+ // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction
+ // will interpret ULONG value as LONG. Hence we need to adjust the
+ // result if sign-bit of srcType is set.
+ if (srcType == TYP_ULONG)
+ {
+ // The instruction sequence below is less accurate than what clang
+ // and gcc generate. However, we keep the current sequence for backward compatibility.
+ // If we change the instructions below, FloatingPointUtils::convertUInt64ToDobule
+ // should be also updated for consistent conversion result.
+ assert(dstType == TYP_DOUBLE);
+ assert(!op1->isContained());
+
+ // Set the flags without modifying op1.
+ // test op1Reg, op1Reg
+ inst_RV_RV(INS_test, op1->gtRegNum, op1->gtRegNum, srcType);
+
+ // No need to adjust result if op1 >= 0 i.e. positive
+ // Jge label
+ BasicBlock* label = genCreateTempLabel();
+ inst_JMP(EJ_jge, label);
+
+ // Adjust the result
+ // result = result + 0x43f00000 00000000
+ // addsd resultReg, 0x43f00000 00000000
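+ // (0x43f0000000000000 is 2^64 encoded as a double: when the ULONG has its top bit set,
+ // cvtsi2sd interpreted it as value - 2^64, so adding 2^64 recovers the intended value.)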
+ GenTreePtr* cns = &u8ToDblBitmask;
+ if (*cns == nullptr)
+ {
+ double d;
+ static_assert_no_msg(sizeof(double) == sizeof(__int64));
+ *((__int64*)&d) = 0x43f0000000000000LL;
+
+ *cns = genMakeConst(&d, dstType, treeNode, true);
+ }
+ inst_RV_TT(INS_addsd, treeNode->gtRegNum, *cns);
+
+ genDefineTempLabel(label);
+ }
+
+ genProduceReg(treeNode);
+}
+
+//------------------------------------------------------------------------
+// genFloatToIntCast: Generate code to cast float/double to int/long
+//
+// Arguments:
+// treeNode - The GT_CAST node
+//
+// Return Value:
+// None.
+//
+// Assumptions:
+// Cast is a non-overflow conversion.
+// The treeNode must have an assigned register.
+// SrcType=float/double and DstType= int32/uint32/int64/uint64
+//
+// TODO-XArch-CQ: (Low-pri) - generate in-line code when DstType = uint64
+//
+void CodeGen::genFloatToIntCast(GenTreePtr treeNode)
+{
+ // we don't expect to see overflow detecting float/double --> int type conversions here
+ // as they should have been converted into helper calls by front-end.
+ assert(treeNode->OperGet() == GT_CAST);
+ assert(!treeNode->gtOverflow());
+
+ regNumber targetReg = treeNode->gtRegNum;
+ assert(genIsValidIntReg(targetReg));
+
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+#ifdef DEBUG
+ if (!op1->isContained())
+ {
+ assert(genIsValidFloatReg(op1->gtRegNum));
+ }
+#endif
+
+ var_types dstType = treeNode->CastToType();
+ var_types srcType = op1->TypeGet();
+ assert(varTypeIsFloating(srcType) && !varTypeIsFloating(dstType));
+
+ // We should never be seeing dstType whose size is neither sizeof(TYP_INT) nor sizeof(TYP_LONG).
+ // For conversions to byte/sbyte/int16/uint16 from float/double, we would expect the
+ // front-end or lowering phase to have generated two levels of cast. The first one is
+ // for float or double to int32/uint32 and the second one for narrowing int32/uint32 to
+ // the required smaller int type.
+ emitAttr dstSize = EA_ATTR(genTypeSize(dstType));
+ noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG))));
+
+ // We shouldn't be seeing uint64 here as it should have been converted
+ // into a helper call by either front-end or lowering phase.
+ noway_assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG))));
+
+ // If the dstType is TYP_UINT, we have 32 bits in which to encode the
+ // result; any of the 33rd or higher bits can serve as the sign bit.
+ // To achieve this we pretend that we are converting to a long.
+ if (varTypeIsUnsigned(dstType) && (dstSize == EA_ATTR(genTypeSize(TYP_INT))))
+ {
+ dstType = TYP_LONG;
+ }
+
+ // Note that we need to specify dstType here so that it will determine
+ // the size of destination integer register and also the rex.w prefix.
+ genConsumeOperands(treeNode->AsOp());
+ instruction ins = ins_FloatConv(TYP_INT, srcType);
+ getEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1);
+ genProduceReg(treeNode);
+}
+
+//------------------------------------------------------------------------
+// genCkfinite: Generate code for ckfinite opcode.
+//
+// Arguments:
+// treeNode - The GT_CKFINITE node
+//
+// Return Value:
+// None.
+//
+// Assumptions:
+// GT_CKFINITE node has reserved an internal register.
+//
+// TODO-XArch-CQ - mark the operand as contained if known to be in
+// memory (e.g. field or an array element).
+//
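+// A floating-point value is non-finite (NaN or +/- infinity) exactly when its exponent field is
+// all ones, so the code below extracts the exponent bits (0x7F800000 for a float, 0x7FF00000 in
+// the high 32 bits of a double) and throws if they are all set.
+//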
+void CodeGen::genCkfinite(GenTreePtr treeNode)
+{
+ assert(treeNode->OperGet() == GT_CKFINITE);
+
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+ var_types targetType = treeNode->TypeGet();
+ int expMask = (targetType == TYP_FLOAT) ? 0x7F800000 : 0x7FF00000; // Bit mask to extract exponent.
+ regNumber targetReg = treeNode->gtRegNum;
+
+ // Extract exponent into a register.
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+ regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+
+ genConsumeReg(op1);
+
+#ifdef _TARGET_64BIT_
+
+ // Copy the floating-point value to an integer register. If we copied a float to a long, then
+ // right-shift the value so the high 32 bits of the floating-point value sit in the low 32
+ // bits of the integer register.
+ instruction ins = ins_CopyFloatToInt(targetType, (targetType == TYP_FLOAT) ? TYP_INT : TYP_LONG);
+ inst_RV_RV(ins, op1->gtRegNum, tmpReg, targetType);
+ if (targetType == TYP_DOUBLE)
+ {
+ // right shift by 32 bits to get to exponent.
+ inst_RV_SH(INS_shr, EA_8BYTE, tmpReg, 32);
+ }
+
+ // Mask exponent with all 1's and check if the exponent is all 1's
+ inst_RV_IV(INS_and, tmpReg, expMask, EA_4BYTE);
+ inst_RV_IV(INS_cmp, tmpReg, expMask, EA_4BYTE);
+
+ // If exponent is all 1's, throw ArithmeticException
+ genJumpToThrowHlpBlk(EJ_je, SCK_ARITH_EXCPN);
+
+ // if it is a finite value copy it to targetReg
+ if (targetReg != op1->gtRegNum)
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
+ }
+
+#else // !_TARGET_64BIT_
+
+ // If the target type is TYP_DOUBLE, we want to extract the high 32 bits into the register.
+ // There is no easy way to do this. To not require an extra register, we'll use shuffles
+ // to move the high 32 bits into the low 32 bits, then shuffle it back, since we
+ // need to produce the value into the target register.
+ //
+ // For TYP_DOUBLE, we'll generate (for targetReg != op1->gtRegNum):
+ // movaps targetReg, op1->gtRegNum
+ // shufps targetReg, targetReg, 0xB1 // WZYX => ZWXY
+ // mov_xmm2i tmpReg, targetReg // tmpReg <= Y
+ // and tmpReg, <mask>
+ // cmp tmpReg, <mask>
+ // je <throw block>
+ // movaps targetReg, op1->gtRegNum // copy the value again, instead of un-shuffling it
+ //
+ // For TYP_DOUBLE with (targetReg == op1->gtRegNum):
+ // shufps targetReg, targetReg, 0xB1 // WZYX => ZWXY
+ // mov_xmm2i tmpReg, targetReg // tmpReg <= Y
+ // and tmpReg, <mask>
+ // cmp tmpReg, <mask>
+ // je <throw block>
+ // shufps targetReg, targetReg, 0xB1 // ZWXY => WZYX
+ //
+ // For TYP_FLOAT, it's the same as _TARGET_64BIT_:
+ // mov_xmm2i tmpReg, targetReg // tmpReg <= low 32 bits
+ // and tmpReg, <mask>
+ // cmp tmpReg, <mask>
+ // je <throw block>
+ // movaps targetReg, op1->gtRegNum // only if targetReg != op1->gtRegNum
+
+ regNumber copyToTmpSrcReg; // The register we'll copy to the integer temp.
+
+ if (targetType == TYP_DOUBLE)
+ {
+ if (targetReg != op1->gtRegNum)
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
+ }
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, targetReg, 0xb1);
+ copyToTmpSrcReg = targetReg;
+ }
+ else
+ {
+ copyToTmpSrcReg = op1->gtRegNum;
+ }
+
+ // Copy only the low 32 bits. This will be the high order 32 bits of the floating-point
+ // value, no matter the floating-point type.
+ inst_RV_RV(ins_CopyFloatToInt(TYP_FLOAT, TYP_INT), copyToTmpSrcReg, tmpReg, TYP_FLOAT);
+
+ // Mask exponent with all 1's and check if the exponent is all 1's
+ inst_RV_IV(INS_and, tmpReg, expMask, EA_4BYTE);
+ inst_RV_IV(INS_cmp, tmpReg, expMask, EA_4BYTE);
+
+ // If exponent is all 1's, throw ArithmeticException
+ genJumpToThrowHlpBlk(EJ_je, SCK_ARITH_EXCPN);
+
+ if (targetReg != op1->gtRegNum)
+ {
+ // In both the TYP_FLOAT and TYP_DOUBLE case, the op1 register is untouched,
+ // so copy it to the targetReg. This is faster and smaller for TYP_DOUBLE
+ // than re-shuffling the targetReg.
+ inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
+ }
+ else if (targetType == TYP_DOUBLE)
+ {
+ // We need to re-shuffle the targetReg to get the correct result.
+ inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, targetReg, 0xb1);
+ }
+
+#endif // !_TARGET_64BIT_
+
+ genProduceReg(treeNode);
+}
+
+#ifdef _TARGET_AMD64_
+int CodeGenInterface::genSPtoFPdelta()
+{
+ int delta;
+
+#ifdef PLATFORM_UNIX
+
+ // We require frame chaining on Unix to support native tool unwinding (such as
+ // unwinding by the native debugger). We have a CLR-only extension to the
+ // unwind codes (UWOP_SET_FPREG_LARGE) to support SP->FP offsets larger than 240.
+ // If Unix ever supports EnC, the RSP == RBP assumption will have to be reevaluated.
+ delta = genTotalFrameSize();
+
+#else // !PLATFORM_UNIX
+
+ // As per Amd64 ABI, RBP offset from initial RSP can be between 0 and 240 if
+ // RBP needs to be reported in unwind codes. This case would arise for methods
+ // with localloc.
+ if (compiler->compLocallocUsed)
+ {
+ // We cannot base delta computation on compLclFrameSize since it changes from
+ // tentative to final frame layout and hence there is a possibility of
+ // under-estimating offset of vars from FP, which in turn results in under-
+ // estimating instruction size.
+ //
+ // To be predictive and so as never to under-estimate offset of vars from FP
+ // we will always position FP at min(240, outgoing arg area size).
+ delta = Min(240, (int)compiler->lvaOutgoingArgSpaceSize);
+ }
+ else if (compiler->opts.compDbgEnC)
+ {
+ // vm assumption on EnC methods is that rsp and rbp are equal
+ delta = 0;
+ }
+ else
+ {
+ delta = genTotalFrameSize();
+ }
+
+#endif // !PLATFORM_UNIX
+
+ return delta;
+}
+
+//---------------------------------------------------------------------
+// genTotalFrameSize - return the total size of the stack frame, including local size,
+// callee-saved register size, etc. For AMD64, this does not include the caller-pushed
+// return address.
+//
+// Return value:
+// Total frame size
+//
+
+int CodeGenInterface::genTotalFrameSize()
+{
+ assert(!IsUninitialized(compiler->compCalleeRegsPushed));
+
+ int totalFrameSize = compiler->compCalleeRegsPushed * REGSIZE_BYTES + compiler->compLclFrameSize;
+
+ assert(totalFrameSize >= 0);
+ return totalFrameSize;
+}
+
+//---------------------------------------------------------------------
+// genCallerSPtoFPdelta - return the offset from Caller-SP to the frame pointer.
+// This number is going to be negative, since the Caller-SP is at a higher
+// address than the frame pointer.
+//
+// There must be a frame pointer to call this function!
+//
+// We can't compute this directly from the Caller-SP, since the frame pointer
+// is based on a maximum delta from Initial-SP, so first we find SP, then
+// compute the FP offset.
+
+int CodeGenInterface::genCallerSPtoFPdelta()
+{
+ assert(isFramePointerUsed());
+ int callerSPtoFPdelta;
+
+ callerSPtoFPdelta = genCallerSPtoInitialSPdelta() + genSPtoFPdelta();
+
+ assert(callerSPtoFPdelta <= 0);
+ return callerSPtoFPdelta;
+}
+
+//---------------------------------------------------------------------
+// genCallerSPtoInitialSPdelta - return the offset from Caller-SP to Initial SP.
+//
+// This number will be negative.
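+//
+// A rough example (hypothetical numbers): with compCalleeRegsPushed = 3 (24 bytes), 0x40 bytes of
+// locals, and a frame pointer in use, the delta is -(24 + 64) - 8 (return address) - 8 (saved RBP)
+// = -104 (-0x68).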
+
+int CodeGenInterface::genCallerSPtoInitialSPdelta()
+{
+ int callerSPtoSPdelta = 0;
+
+ callerSPtoSPdelta -= genTotalFrameSize();
+ callerSPtoSPdelta -= REGSIZE_BYTES; // caller-pushed return address
+
+ // compCalleeRegsPushed does not account for the frame pointer
+ // TODO-Cleanup: shouldn't this be part of genTotalFrameSize?
+ if (isFramePointerUsed())
+ {
+ callerSPtoSPdelta -= REGSIZE_BYTES;
+ }
+
+ assert(callerSPtoSPdelta <= 0);
+ return callerSPtoSPdelta;
+}
+#endif // _TARGET_AMD64_
+
+//-----------------------------------------------------------------------------------------
+// genSSE2BitwiseOp - generate SSE2 code for the given oper as "Operand BitWiseOp BitMask"
+//
+// Arguments:
+// treeNode - tree node
+//
+// Return value:
+// None
+//
+// Assumptions:
+// i) tree oper is one of GT_NEG or GT_INTRINSIC Abs()
+// ii) tree type is floating point type.
+// iii) caller of this routine needs to call genProduceReg()
+void CodeGen::genSSE2BitwiseOp(GenTreePtr treeNode)
+{
+ regNumber targetReg = treeNode->gtRegNum;
+ var_types targetType = treeNode->TypeGet();
+ assert(varTypeIsFloating(targetType));
+
+ float f;
+ double d;
+ GenTreePtr* bitMask = nullptr;
+ instruction ins = INS_invalid;
+ void* cnsAddr = nullptr;
+ bool dblAlign = false;
+
+ switch (treeNode->OperGet())
+ {
+ case GT_NEG:
+ // Neg(x) = flip the sign bit.
+ // Neg(f) = f ^ 0x80000000
+ // Neg(d) = d ^ 0x8000000000000000
+ ins = genGetInsForOper(GT_XOR, targetType);
+ if (targetType == TYP_FLOAT)
+ {
+ bitMask = &negBitmaskFlt;
+
+ static_assert_no_msg(sizeof(float) == sizeof(int));
+ *((int*)&f) = 0x80000000;
+ cnsAddr = &f;
+ }
+ else
+ {
+ bitMask = &negBitmaskDbl;
+
+ static_assert_no_msg(sizeof(double) == sizeof(__int64));
+ *((__int64*)&d) = 0x8000000000000000LL;
+ cnsAddr = &d;
+ dblAlign = true;
+ }
+ break;
+
+ case GT_INTRINSIC:
+ assert(treeNode->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs);
+
+ // Abs(x) = set sign-bit to zero
+ // Abs(f) = f & 0x7fffffff
+ // Abs(d) = d & 0x7fffffffffffffff
+ ins = genGetInsForOper(GT_AND, targetType);
+ if (targetType == TYP_FLOAT)
+ {
+ bitMask = &absBitmaskFlt;
+
+ static_assert_no_msg(sizeof(float) == sizeof(int));
+ *((int*)&f) = 0x7fffffff;
+ cnsAddr = &f;
+ }
+ else
+ {
+ bitMask = &absBitmaskDbl;
+
+ static_assert_no_msg(sizeof(double) == sizeof(__int64));
+ *((__int64*)&d) = 0x7fffffffffffffffLL;
+ cnsAddr = &d;
+ dblAlign = true;
+ }
+ break;
+
+ default:
+ assert(!"genSSE2: unsupported oper");
+ unreached();
+ break;
+ }
+
+ if (*bitMask == nullptr)
+ {
+ assert(cnsAddr != nullptr);
+ *bitMask = genMakeConst(cnsAddr, targetType, treeNode, dblAlign);
+ }
+
+ // We need an additional register for bitmask.
+ // Make sure we have one allocated.
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+ regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+
+ // Move operand into targetReg only if the reg reserved for
+ // internal purpose is not the same as targetReg.
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+ assert(!op1->isContained());
+ regNumber operandReg = genConsumeReg(op1);
+ if (tmpReg != targetReg)
+ {
+ if (operandReg != targetReg)
+ {
+ inst_RV_RV(ins_Copy(targetType), targetReg, operandReg, targetType);
+ }
+
+ operandReg = tmpReg;
+ }
+
+ inst_RV_TT(ins_Load(targetType, false), tmpReg, *bitMask);
+ assert(ins != INS_invalid);
+ inst_RV_RV(ins, targetReg, operandReg, targetType);
+}
+
+//---------------------------------------------------------------------
+// genIntrinsic - generate code for a given intrinsic
+//
+// Arguments
+// treeNode - the GT_INTRINSIC node
+//
+// Return value:
+// None
+//
+void CodeGen::genIntrinsic(GenTreePtr treeNode)
+{
+ // Right now only Sqrt/Abs are treated as math intrinsics.
+ switch (treeNode->gtIntrinsic.gtIntrinsicId)
+ {
+ case CORINFO_INTRINSIC_Sqrt:
+ noway_assert(treeNode->TypeGet() == TYP_DOUBLE);
+ genConsumeOperands(treeNode->AsOp());
+ getEmitter()->emitInsBinary(ins_FloatSqrt(treeNode->TypeGet()), emitTypeSize(treeNode), treeNode,
+ treeNode->gtOp.gtOp1);
+ break;
+
+ case CORINFO_INTRINSIC_Abs:
+ genSSE2BitwiseOp(treeNode);
+ break;
+
+ default:
+ assert(!"genIntrinsic: Unsupported intrinsic");
+ unreached();
+ }
+
+ genProduceReg(treeNode);
+}
+
+//-------------------------------------------------------------------------- //
+// getBaseVarForPutArgStk - returns the baseVarNum for passing a stack arg.
+//
+// Arguments
+// treeNode - the GT_PUTARG_STK node
+//
+// Return value:
+// The number of the base variable.
+//
+// Note:
+// If this is a tail call, the outgoing args are placed in the caller's incoming arg stack space.
+// Otherwise, they go in the outgoing arg area on the current frame.
+//
+// On Windows the caller always creates slots (homing space) in its frame for the
+// first 4 arguments of a callee (register passed args). So, the baseVarNum is always 0.
+// For System V systems there is no such calling convention requirement, and the code needs to find
+// the first stack passed argument from the caller. This is done by iterating over
+// all the lvParam variables and finding the first with lvArgReg equal to REG_STK.
+//
+unsigned CodeGen::getBaseVarForPutArgStk(GenTreePtr treeNode)
+{
+ assert(treeNode->OperGet() == GT_PUTARG_STK);
+
+ unsigned baseVarNum;
+
+#if FEATURE_FASTTAILCALL
+ bool putInIncomingArgArea = treeNode->AsPutArgStk()->putInIncomingArgArea;
+#else
+ const bool putInIncomingArgArea = false;
+#endif
+
+ // Do we set up the stack arg in the incoming or the outgoing arg area?
+ // Fast tail calls implemented as epilog+jmp: the stack arg is set up in the incoming arg area.
+ // All other calls: the stack arg is set up in the outgoing arg area.
+ if (putInIncomingArgArea)
+ {
+ // See the note in the function header re: finding the first stack passed argument.
+ baseVarNum = getFirstArgWithStackSlot();
+ assert(baseVarNum != BAD_VAR_NUM);
+
+#ifdef DEBUG
+ // This must be a fast tail call.
+ assert(treeNode->AsPutArgStk()->gtCall->AsCall()->IsFastTailCall());
+
+ // Since it is a fast tail call, the existence of the first incoming arg is guaranteed,
+ // because a fast tail call requires that the caller's incoming arg area be >= the outgoing
+ // arg area required for the tail call.
+ LclVarDsc* varDsc = &(compiler->lvaTable[baseVarNum]);
+ assert(varDsc != nullptr);
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+ assert(!varDsc->lvIsRegArg && varDsc->lvArgReg == REG_STK);
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+ // On Windows this assert is always true. The first argument will always be in REG_ARG_0 or REG_FLTARG_0.
+ assert(varDsc->lvIsRegArg && (varDsc->lvArgReg == REG_ARG_0 || varDsc->lvArgReg == REG_FLTARG_0));
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+#endif // DEBUG
+ }
+ else
+ {
+#if FEATURE_FIXED_OUT_ARGS
+ baseVarNum = compiler->lvaOutgoingArgSpaceVar;
+#else // !FEATURE_FIXED_OUT_ARGS
+ NYI_X86("Stack args for x86/RyuJIT");
+ baseVarNum = BAD_VAR_NUM;
+#endif // !FEATURE_FIXED_OUT_ARGS
+ }
+
+ return baseVarNum;
+}
+
+//--------------------------------------------------------------------- //
+// genPutArgStk - generate code for passing an arg on the stack.
+//
+// Arguments
+// treeNode - the GT_PUTARG_STK node
+// targetType - the type of the treeNode
+//
+// Return value:
+// None
+//
+void CodeGen::genPutArgStk(GenTreePtr treeNode)
+{
+ var_types targetType = treeNode->TypeGet();
+#ifdef _TARGET_X86_
+ noway_assert(targetType != TYP_STRUCT);
+
+ // The following logic is applicable for x86 arch.
+ assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
+
+ GenTreePtr data = treeNode->gtOp.gtOp1;
+
+ // On a 32-bit target, all of the long arguments have been decomposed into
+ // a separate putarg_stk for each of the upper and lower halves.
+ noway_assert(targetType != TYP_LONG);
+
+ int argSize = genTypeSize(genActualType(targetType));
+ genStackLevel += argSize;
+
+ // TODO-Cleanup: Handle this in emitInsMov() in emitXArch.cpp?
+ if (data->isContainedIntOrIImmed())
+ {
+ if (data->IsIconHandle())
+ {
+ inst_IV_handle(INS_push, data->gtIntCon.gtIconVal);
+ }
+ else
+ {
+ inst_IV(INS_push, data->gtIntCon.gtIconVal);
+ }
+ }
+ else if (data->isContained())
+ {
+ NYI_X86("Contained putarg_stk of non-constant");
+ }
+ else
+ {
+ genConsumeReg(data);
+ if (varTypeIsIntegralOrI(targetType))
+ {
+ inst_RV(INS_push, data->gtRegNum, targetType);
+ }
+ else
+ {
+ // Decrement SP.
+ inst_RV_IV(INS_sub, REG_SPBASE, argSize, emitActualTypeSize(TYP_I_IMPL));
+ getEmitter()->emitIns_AR_R(ins_Store(targetType), emitTypeSize(targetType), data->gtRegNum, REG_SPBASE, 0);
+ }
+ }
+#else // !_TARGET_X86_
+ {
+ unsigned baseVarNum = getBaseVarForPutArgStk(treeNode);
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ if (varTypeIsStruct(targetType))
+ {
+ genPutStructArgStk(treeNode, baseVarNum);
+ return;
+ }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+ noway_assert(targetType != TYP_STRUCT);
+ assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
+
+ // Get argument offset on stack.
+ // Here we cross check that argument offset hasn't changed from lowering to codegen since
+ // we are storing arg slot number in GT_PUTARG_STK node in lowering phase.
+ int argOffset = treeNode->AsPutArgStk()->getArgOffset();
+
+#ifdef DEBUG
+ fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(treeNode->AsPutArgStk()->gtCall, treeNode);
+ assert(curArgTabEntry);
+ assert(argOffset == (int)curArgTabEntry->slotNum * TARGET_POINTER_SIZE);
+#endif
+
+ GenTreePtr data = treeNode->gtGetOp1();
+
+ if (data->isContained())
+ {
+ getEmitter()->emitIns_S_I(ins_Store(targetType), emitTypeSize(targetType), baseVarNum, argOffset,
+ (int)data->AsIntConCommon()->IconValue());
+ }
+ else
+ {
+ genConsumeReg(data);
+ getEmitter()->emitIns_S_R(ins_Store(targetType), emitTypeSize(targetType), data->gtRegNum, baseVarNum,
+ argOffset);
+ }
+ }
+#endif // !_TARGET_X86_
+}
+
+#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+//---------------------------------------------------------------------
+// genPutStructArgStk - generate code for copying a struct arg on the stack by value.
+// In case there are references to heap objects in the struct,
+// it generates the gcinfo as well.
+//
+// Arguments
+// treeNode - the GT_PUTARG_STK node
+// baseVarNum - the variable number relative to which to put the argument on the stack.
+// For tail calls this is baseVarNum = 0.
+// For non tail calls this is the outgoingArgSpace.
+//
+// Return value:
+// None
+//
+void CodeGen::genPutStructArgStk(GenTreePtr treeNode, unsigned baseVarNum)
+{
+ assert(treeNode->OperGet() == GT_PUTARG_STK);
+ assert(baseVarNum != BAD_VAR_NUM);
+
+ var_types targetType = treeNode->TypeGet();
+
+ if (varTypeIsSIMD(targetType))
+ {
+ regNumber srcReg = genConsumeReg(treeNode->gtGetOp1());
+ assert((srcReg != REG_NA) && (genIsValidFloatReg(srcReg)));
+ getEmitter()->emitIns_S_R(ins_Store(targetType), emitTypeSize(targetType), srcReg, baseVarNum,
+ treeNode->AsPutArgStk()->getArgOffset());
+ return;
+ }
+
+ assert(targetType == TYP_STRUCT);
+
+ GenTreePutArgStk* putArgStk = treeNode->AsPutArgStk();
+ if (putArgStk->gtNumberReferenceSlots == 0)
+ {
+ switch (putArgStk->gtPutArgStkKind)
+ {
+ case GenTreePutArgStk::PutArgStkKindRepInstr:
+ genStructPutArgRepMovs(putArgStk, baseVarNum);
+ break;
+ case GenTreePutArgStk::PutArgStkKindUnroll:
+ genStructPutArgUnroll(putArgStk, baseVarNum);
+ break;
+ default:
+ unreached();
+ }
+ }
+ else
+ {
+ // No need to disable GC the way COPYOBJ does. Here the refs are copied in atomic operations always.
+
+ // Consume these registers.
+ // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
+ genConsumePutStructArgStk(putArgStk, REG_RDI, REG_RSI, REG_NA, baseVarNum);
+ GenTreePtr dstAddr = putArgStk;
+ GenTreePtr src = putArgStk->gtOp.gtOp1;
+ assert(src->OperGet() == GT_OBJ);
+ GenTreePtr srcAddr = src->gtGetOp1();
+
+ unsigned slots = putArgStk->gtNumSlots;
+
+ // We are always copying to the stack, so we don't need to use the write barrier.
+ BYTE* gcPtrs = putArgStk->gtGcPtrs;
+ unsigned gcPtrCount = putArgStk->gtNumberReferenceSlots;
+
+ unsigned i = 0;
+ unsigned copiedSlots = 0;
+ while (i < slots)
+ {
+ switch (gcPtrs[i])
+ {
+ case TYPE_GC_NONE:
+ // Let's see if we can use rep movsq instead of a sequence of movsq instructions
+ // to save cycles and code size.
+ {
+ unsigned nonGcSlotCount = 0;
+
+ do
+ {
+ nonGcSlotCount++;
+ i++;
+ } while (i < slots && gcPtrs[i] == TYPE_GC_NONE);
+
+ // If we have a very small contiguous non-gc region, it's better just to
+ // emit a sequence of movsq instructions
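+ // For example, a short run below the limit becomes "movsq; movsq; ...", while a longer
+ // run becomes "mov ecx, <count>" + "rep movsq" (RDI/RSI hold the destination and source,
+ // set up by genConsumePutStructArgStk above).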
+ if (nonGcSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
+ {
+ copiedSlots += nonGcSlotCount;
+ while (nonGcSlotCount > 0)
+ {
+ instGen(INS_movsq);
+ nonGcSlotCount--;
+ }
+ }
+ else
+ {
+ getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, nonGcSlotCount);
+ copiedSlots += nonGcSlotCount;
+ instGen(INS_r_movsq);
+ }
+ }
+ break;
+
+ case TYPE_GC_REF: // Is an object ref
+ case TYPE_GC_BYREF: // Is an interior pointer - promote it but don't scan it
+ {
+ // We have a GC (byref or ref) pointer
+ // TODO-Amd64-Unix: Here a better solution (for code size and CQ) would be to use movsq instruction,
+ // but the logic for emitting a GC info record is not available (it is internal for the emitter
+ // only.) See emitGCVarLiveUpd function. If we could call it separately, we could do
+ // instGen(INS_movsq); and emission of gc info.
+
+ var_types memType;
+ if (gcPtrs[i] == TYPE_GC_REF)
+ {
+ memType = TYP_REF;
+ }
+ else
+ {
+ assert(gcPtrs[i] == TYPE_GC_BYREF);
+ memType = TYP_BYREF;
+ }
+
+ getEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), REG_RCX, REG_RSI, 0);
+ getEmitter()->emitIns_S_R(ins_Store(memType), emitTypeSize(memType), REG_RCX, baseVarNum,
+ ((copiedSlots + putArgStk->gtSlotNum) * TARGET_POINTER_SIZE));
+
+ // Source for the copy operation.
+ // If a LocalAddr, use EA_PTRSIZE - copy from stack.
+ // If not a LocalAddr, use EA_BYREF - the source location is not on the stack.
+ getEmitter()->emitIns_R_I(INS_add, ((src->OperIsLocalAddr()) ? EA_PTRSIZE : EA_BYREF), REG_RSI,
+ TARGET_POINTER_SIZE);
+
+ // Always copying to the stack - outgoing arg area
+ // (or the outgoing arg area of the caller for a tail call) - use EA_PTRSIZE.
+ getEmitter()->emitIns_R_I(INS_add, EA_PTRSIZE, REG_RDI, TARGET_POINTER_SIZE);
+ copiedSlots++;
+ gcPtrCount--;
+ i++;
+ }
+ break;
+
+ default:
+ unreached();
+ break;
+ }
+ }
+
+ assert(gcPtrCount == 0);
+ }
+}
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+/*****************************************************************************
+ *
+ * Create and record GC Info for the function.
+ */
+#ifdef _TARGET_AMD64_
+void
+#else // !_TARGET_AMD64_
+void*
+#endif // !_TARGET_AMD64_
+CodeGen::genCreateAndStoreGCInfo(unsigned codeSize, unsigned prologSize, unsigned epilogSize DEBUGARG(void* codePtr))
+{
+#ifdef JIT32_GCENCODER
+ return genCreateAndStoreGCInfoJIT32(codeSize, prologSize, epilogSize DEBUGARG(codePtr));
+#else // !JIT32_GCENCODER
+ genCreateAndStoreGCInfoX64(codeSize, prologSize DEBUGARG(codePtr));
+#endif // !JIT32_GCENCODER
+}
+
+#ifdef JIT32_GCENCODER
+void* CodeGen::genCreateAndStoreGCInfoJIT32(unsigned codeSize,
+ unsigned prologSize,
+ unsigned epilogSize DEBUGARG(void* codePtr))
+{
+ BYTE headerBuf[64];
+ InfoHdr header;
+
+ int s_cached;
+#ifdef DEBUG
+ size_t headerSize =
+#endif
+ compiler->compInfoBlkSize =
+ gcInfo.gcInfoBlockHdrSave(headerBuf, 0, codeSize, prologSize, epilogSize, &header, &s_cached);
+
+ size_t argTabOffset = 0;
+ size_t ptrMapSize = gcInfo.gcPtrTableSize(header, codeSize, &argTabOffset);
+
+#if DISPLAY_SIZES
+
+ if (genInterruptible)
+ {
+ gcHeaderISize += compiler->compInfoBlkSize;
+ gcPtrMapISize += ptrMapSize;
+ }
+ else
+ {
+ gcHeaderNSize += compiler->compInfoBlkSize;
+ gcPtrMapNSize += ptrMapSize;
+ }
+
+#endif // DISPLAY_SIZES
+
+ compiler->compInfoBlkSize += ptrMapSize;
+
+ /* Allocate the info block for the method */
+
+ compiler->compInfoBlkAddr = (BYTE*)compiler->info.compCompHnd->allocGCInfo(compiler->compInfoBlkSize);
+
+#if 0 // VERBOSE_SIZES
+ // TODO-X86-Cleanup: 'dataSize', below, is not defined
+
+// if (compiler->compInfoBlkSize > codeSize && compiler->compInfoBlkSize > 100)
+ {
+ printf("[%7u VM, %7u+%7u/%7u x86 %03u/%03u%%] %s.%s\n",
+ compiler->info.compILCodeSize,
+ compiler->compInfoBlkSize,
+ codeSize + dataSize,
+ codeSize + dataSize - prologSize - epilogSize,
+ 100 * (codeSize + dataSize) / compiler->info.compILCodeSize,
+ 100 * (codeSize + dataSize + compiler->compInfoBlkSize) / compiler->info.compILCodeSize,
+ compiler->info.compClassName,
+ compiler->info.compMethodName);
+}
+
+#endif
+
+ /* Fill in the info block and return it to the caller */
+
+ void* infoPtr = compiler->compInfoBlkAddr;
+
+ /* Create the method info block: header followed by GC tracking tables */
+
+ compiler->compInfoBlkAddr +=
+ gcInfo.gcInfoBlockHdrSave(compiler->compInfoBlkAddr, -1, codeSize, prologSize, epilogSize, &header, &s_cached);
+
+ assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize);
+ compiler->compInfoBlkAddr = gcInfo.gcPtrTableSave(compiler->compInfoBlkAddr, header, codeSize, &argTabOffset);
+ assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize + ptrMapSize);
+
+#ifdef DEBUG
+
+ if (0)
+ {
+ BYTE* temp = (BYTE*)infoPtr;
+ unsigned size = compiler->compInfoBlkAddr - temp;
+ BYTE* ptab = temp + headerSize;
+
+ noway_assert(size == headerSize + ptrMapSize);
+
+ printf("Method info block - header [%u bytes]:", headerSize);
+
+ for (unsigned i = 0; i < size; i++)
+ {
+ if (temp == ptab)
+ {
+ printf("\nMethod info block - ptrtab [%u bytes]:", ptrMapSize);
+ printf("\n %04X: %*c", i & ~0xF, 3 * (i & 0xF), ' ');
+ }
+ else
+ {
+ if (!(i % 16))
+ printf("\n %04X: ", i);
+ }
+
+ printf("%02X ", *temp++);
+ }
+
+ printf("\n");
+ }
+
+#endif // DEBUG
+
+#if DUMP_GC_TABLES
+
+ if (compiler->opts.dspGCtbls)
+ {
+ const BYTE* base = (BYTE*)infoPtr;
+ unsigned size;
+ unsigned methodSize;
+ InfoHdr dumpHeader;
+
+ printf("GC Info for method %s\n", compiler->info.compFullName);
+ printf("GC info size = %3u\n", compiler->compInfoBlkSize);
+
+ size = gcInfo.gcInfoBlockHdrDump(base, &dumpHeader, &methodSize);
+ // printf("size of header encoding is %3u\n", size);
+ printf("\n");
+
+ if (compiler->opts.dspGCtbls)
+ {
+ base += size;
+ size = gcInfo.gcDumpPtrTable(base, dumpHeader, methodSize);
+ // printf("size of pointer table is %3u\n", size);
+ printf("\n");
+ noway_assert(compiler->compInfoBlkAddr == (base + size));
+ }
+ }
+
+#ifdef DEBUG
+ if (jitOpts.testMask & 128)
+ {
+ for (unsigned offs = 0; offs < codeSize; offs++)
+ {
+ gcInfo.gcFindPtrsInFrame(infoPtr, codePtr, offs);
+ }
+ }
+#endif // DEBUG
+#endif // DUMP_GC_TABLES
+
+ /* Make sure we ended up generating the expected number of bytes */
+
+ noway_assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + compiler->compInfoBlkSize);
+
+ return infoPtr;
+}
+
+#else // !JIT32_GCENCODER
+void CodeGen::genCreateAndStoreGCInfoX64(unsigned codeSize, unsigned prologSize DEBUGARG(void* codePtr))
+{
+ IAllocator* allowZeroAlloc = new (compiler, CMK_GC) AllowZeroAllocator(compiler->getAllocatorGC());
+ GcInfoEncoder* gcInfoEncoder = new (compiler, CMK_GC)
+ GcInfoEncoder(compiler->info.compCompHnd, compiler->info.compMethodInfo, allowZeroAlloc, NOMEM);
+ assert(gcInfoEncoder);
+
+ // Follow the code pattern of the x86 gc info encoder (genCreateAndStoreGCInfoJIT32).
+ gcInfo.gcInfoBlockHdrSave(gcInfoEncoder, codeSize, prologSize);
+
+ // First we figure out the encoder ID's for the stack slots and registers.
+ gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_ASSIGN_SLOTS);
+ // Now we've requested all the slots we'll need; "finalize" these (make more compact data structures for them).
+ gcInfoEncoder->FinalizeSlotIds();
+ // Now we can actually use those slot ID's to declare live ranges.
+ gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_DO_WORK);
+
+#if defined(DEBUGGING_SUPPORT)
+ if (compiler->opts.compDbgEnC)
+ {
+ // what we have to preserve is called the "frame header" (see comments in VM\eetwain.cpp)
+ // which is:
+ // -return address
+ // -saved off RBP
+ // -saved 'this' pointer and bool for synchronized methods
+
+ // 4 slots for RBP + return address + RSI + RDI
+ int preservedAreaSize = 4 * REGSIZE_BYTES;
+
+ if (compiler->info.compFlags & CORINFO_FLG_SYNCH)
+ {
+ if (!(compiler->info.compFlags & CORINFO_FLG_STATIC))
+ {
+ preservedAreaSize += REGSIZE_BYTES;
+ }
+
+ // bool in synchronized methods that tracks whether the lock has been taken (takes 4 bytes on stack)
+ preservedAreaSize += 4;
+ }
+
+ // Used to signal both that the method is compiled for EnC, and also the size of the block at the top of the
+ // frame
+ gcInfoEncoder->SetSizeOfEditAndContinuePreservedArea(preservedAreaSize);
+ }
+#endif
+
+ gcInfoEncoder->Build();
+
+ // GC Encoder automatically puts the GC info in the right spot using ICorJitInfo::allocGCInfo(size_t)
+ // let's save the values anyway for debugging purposes
+ compiler->compInfoBlkAddr = gcInfoEncoder->Emit();
+ compiler->compInfoBlkSize = 0; // not exposed by the GCEncoder interface
+}
+#endif // !JIT32_GCENCODER
+
+/*****************************************************************************
+ * Emit a call to a helper function.
+ *
+ */
+
+void CodeGen::genEmitHelperCall(unsigned helper, int argSize, emitAttr retSize, regNumber callTargetReg)
+{
+ void* addr = nullptr;
+ void* pAddr = nullptr;
+
+ emitter::EmitCallType callType = emitter::EC_FUNC_TOKEN;
+ addr = compiler->compGetHelperFtn((CorInfoHelpFunc)helper, &pAddr);
+ regNumber callTarget = REG_NA;
+ regMaskTP killMask = compiler->compHelperCallKillSet((CorInfoHelpFunc)helper);
+
+ if (!addr)
+ {
+ assert(pAddr != nullptr);
+
+ // Absolute indirect call addr
+ // Note: The order of checks is important. Always check for pc-relative first and zero-relative
+ // next, because the former encoding is 1 byte smaller than the latter.
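+ // (For instance, on amd64 a pc-relative "call [rip+disp32]" encodes in 6 bytes, while an
+ // absolute "call [disp32]" needs an extra SIB byte and takes 7.)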
+ if (genCodeIndirAddrCanBeEncodedAsPCRelOffset((size_t)pAddr) ||
+ genCodeIndirAddrCanBeEncodedAsZeroRelOffset((size_t)pAddr))
+ {
+ // generate call whose target is specified by 32-bit offset relative to PC or zero.
+ callType = emitter::EC_FUNC_TOKEN_INDIR;
+ addr = pAddr;
+ }
+ else
+ {
+#ifdef _TARGET_AMD64_
+ // If this indirect address cannot be encoded as 32-bit offset relative to PC or Zero,
+ // load it into REG_HELPER_CALL_TARGET and use register indirect addressing mode to
+ // make the call.
+ // mov reg, addr
+ // call [reg]
+
+ if (callTargetReg == REG_NA)
+ {
+ // If a callTargetReg has not been explicitly provided, we will use REG_DEFAULT_HELPER_CALL_TARGET, but
+ // this is only a valid assumption if the helper call is known to kill REG_DEFAULT_HELPER_CALL_TARGET.
+ callTargetReg = REG_DEFAULT_HELPER_CALL_TARGET;
+ regMaskTP callTargetMask = genRegMask(callTargetReg);
+ noway_assert((callTargetMask & killMask) == callTargetMask);
+ }
+ else
+ {
+ // The explicitly provided call target must not overwrite any live variable, even though
+ // it may not be in the kill set for the call.
+ regMaskTP callTargetMask = genRegMask(callTargetReg);
+ noway_assert((callTargetMask & regSet.rsMaskVars) == RBM_NONE);
+ }
+#endif
+
+ callTarget = callTargetReg;
+ genSetRegToIcon(callTarget, (ssize_t)pAddr, TYP_I_IMPL);
+ callType = emitter::EC_INDIR_ARD;
+ }
+ }
+
+ getEmitter()->emitIns_Call(callType, compiler->eeFindHelper(helper), INDEBUG_LDISASM_COMMA(nullptr) addr, argSize,
+ retSize FEATURE_UNIX_AMD64_STRUCT_PASSING_ONLY_ARG(EA_UNKNOWN), gcInfo.gcVarPtrSetCur,
+ gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur,
+ BAD_IL_OFFSET, // IL offset
+ callTarget, // ireg
+ REG_NA, 0, 0, // xreg, xmul, disp
+ false, // isJump
+ emitter::emitNoGChelper(helper));
+
+ regTracker.rsTrashRegSet(killMask);
+ regTracker.rsTrashRegsForGCInterruptability();
+}
+
+#if !defined(_TARGET_64BIT_)
+//-----------------------------------------------------------------------------
+//
+// Code Generation for Long integers
+//
+//-----------------------------------------------------------------------------
+
+//------------------------------------------------------------------------
+// genStoreLongLclVar: Generate code to store a non-enregistered long lclVar
+//
+// Arguments:
+// treeNode - A TYP_LONG lclVar node.
+//
+// Return Value:
+// None.
+//
+// Assumptions:
+// 'treeNode' must be a TYP_LONG lclVar node for a lclVar that has NOT been promoted.
+// Its operand must be a GT_LONG node.
+//
+void CodeGen::genStoreLongLclVar(GenTree* treeNode)
+{
+ emitter* emit = getEmitter();
+
+ GenTreeLclVarCommon* lclNode = treeNode->AsLclVarCommon();
+ unsigned lclNum = lclNode->gtLclNum;
+ LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
+ assert(varDsc->TypeGet() == TYP_LONG);
+ assert(!varDsc->lvPromoted);
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+ noway_assert(op1->OperGet() == GT_LONG);
+ genConsumeRegs(op1);
+
+ // Definitions of register candidates will have been lowered to 2 int lclVars.
+ assert(!treeNode->InReg());
+
+ GenTreePtr loVal = op1->gtGetOp1();
+ GenTreePtr hiVal = op1->gtGetOp2();
+ // NYI: Contained immediates.
+ NYI_IF((loVal->gtRegNum == REG_NA) || (hiVal->gtRegNum == REG_NA), "Store of long lclVar with contained immediate");
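+ // The two calls below store the lo and hi halves into the lclVar's stack slot at offsets
+ // 0 and 4. For example (illustrative x86 output only, with hypothetical registers and a
+ // hypothetical frame offset), for a long lclVar at [ebp-8] with loVal in eax and hiVal in edx:
+ //   mov dword ptr [ebp-8], eax
+ //   mov dword ptr [ebp-4], edx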
+ emit->emitIns_R_S(ins_Store(TYP_INT), EA_4BYTE, loVal->gtRegNum, lclNum, 0);
+ emit->emitIns_R_S(ins_Store(TYP_INT), EA_4BYTE, hiVal->gtRegNum, lclNum, genTypeSize(TYP_INT));
+}
+#endif // !defined(_TARGET_64BIT_)
+
+/*****************************************************************************
+* Unit testing of the XArch emitter: generate a bunch of instructions into the prolog
+* (it's as good a place as any), then use COMPlus_JitLateDisasm=* to see if the late
+* disassembler thinks the instructions are the same as we do.
+*/
+
+// Uncomment "#define ALL_XARCH_EMITTER_UNIT_TESTS" to run all the unit tests here.
+// After adding a unit test and verifying that it works, put it under this #ifdef, so we don't see it run every time.
+//#define ALL_XARCH_EMITTER_UNIT_TESTS
+
+#if defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_AMD64_)
+void CodeGen::genAmd64EmitterUnitTests()
+{
+ if (!verbose)
+ {
+ return;
+ }
+
+ if (!compiler->opts.altJit)
+ {
+ // No point doing this in a "real" JIT.
+ return;
+ }
+
+ // Mark the "fake" instructions in the output.
+ printf("*************** In genAmd64EmitterUnitTests()\n");
+
+ // We use this:
+ // genDefineTempLabel(genCreateTempLabel());
+ // to create artificial labels to help separate groups of tests.
+
+ //
+ // Three-operand AVX arithmetic and conversion instructions
+ //
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+#ifdef ALL_XARCH_EMITTER_UNIT_TESTS
+#ifdef FEATURE_AVX_SUPPORT
+ genDefineTempLabel(genCreateTempLabel());
+
+ // vhaddpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_haddpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_addss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddsd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_addsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddps xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_addps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddps ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_addps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddpd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_addpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vaddpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_addpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_subss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubsd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_subsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubps xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_subps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubps ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_subps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubpd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_subpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vsubpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_subpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_mulss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulsd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_mulsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulps xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_mulps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulpd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_mulpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulps ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_mulps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vmulpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_mulpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vandps xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_andps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vandpd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_andpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vandps ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_andps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vandpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_andpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vorps xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_orps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vorpd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_orpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vorps ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_orps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vorpd ymm0,ymm1,ymm2
+ getEmitter()->emitIns_R_R_R(INS_orpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vdivss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vdivsd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vdivss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vdivsd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+
+ // vcvtss2sd xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_cvtss2sd, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+ // vcvtsd2ss xmm0,xmm1,xmm2
+ getEmitter()->emitIns_R_R_R(INS_cvtsd2ss, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2);
+#endif // FEATURE_AVX_SUPPORT
+#endif // ALL_XARCH_EMITTER_UNIT_TESTS
+ printf("*************** End of genAmd64EmitterUnitTests()\n");
+}
+
+#endif // defined(DEBUG) && defined(LATE_DISASM) && defined(_TARGET_AMD64_)
+
+/*****************************************************************************/
+#ifdef DEBUGGING_SUPPORT
+/*****************************************************************************
+ * genSetScopeInfo
+ *
+ * Called by the main genSetScopeInfo() for every scope info piece to be recorded.
+ */
+
+void CodeGen::genSetScopeInfo(unsigned which,
+ UNATIVE_OFFSET startOffs,
+ UNATIVE_OFFSET length,
+ unsigned varNum,
+ unsigned LVnum,
+ bool avail,
+ Compiler::siVarLoc& varLoc)
+{
+ /* We need to do some mapping while reporting back these variables */
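+
+ // For instance (illustrative, not an exhaustive list), compiler-introduced variables such as
+ // the varargs handle are reported under special IL numbers (e.g. ICorDebugInfo::VARARGS_HND_ILNUM)
+ // rather than under a real IL variable index.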
+
+ unsigned ilVarNum = compiler->compMap2ILvarNum(varNum);
+ noway_assert((int)ilVarNum != ICorDebugInfo::UNKNOWN_ILNUM);
+
+ VarName name = nullptr;
+
+#ifdef DEBUG
+
+ for (unsigned scopeNum = 0; scopeNum < compiler->info.compVarScopesCount; scopeNum++)
+ {
+ if (LVnum == compiler->info.compVarScopes[scopeNum].vsdLVnum)
+ {
+ name = compiler->info.compVarScopes[scopeNum].vsdName;
+ }
+ }
+
+ // Hang on to this info.
+
+ TrnslLocalVarInfo& tlvi = genTrnslLocalVarInfo[which];
+
+ tlvi.tlviVarNum = ilVarNum;
+ tlvi.tlviLVnum = LVnum;
+ tlvi.tlviName = name;
+ tlvi.tlviStartPC = startOffs;
+ tlvi.tlviLength = length;
+ tlvi.tlviAvailable = avail;
+ tlvi.tlviVarLoc = varLoc;
+
+#endif // DEBUG
+
+ compiler->eeSetLVinfo(which, startOffs, length, ilVarNum, LVnum, name, avail, varLoc);
+}
+#endif // DEBUGGING_SUPPORT
+
+#endif // _TARGET_AMD64_
+
+#endif // !LEGACY_BACKEND