summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruce Forstall <brucefo@microsoft.com>2016-07-13 16:00:35 -0700
committerBruce Forstall <brucefo@microsoft.com>2016-07-14 11:36:58 -0700
commit7a516a2b03833d3f445c20f52ea41c45e233c2ab (patch)
tree42d058313b8644aced2c81b7c2728ec2254b0a78
parent0a8cae775a27861a37f89b2c042faea90e25cdaa (diff)
downloadcoreclr-7a516a2b03833d3f445c20f52ea41c45e233c2ab.tar.gz
coreclr-7a516a2b03833d3f445c20f52ea41c45e233c2ab.tar.bz2
coreclr-7a516a2b03833d3f445c20f52ea41c45e233c2ab.zip
Clean up localloc implementation; improve AMD64 codegen for initialized locallocs
-rwxr-xr-xsrc/jit/codegenxarch.cpp104
-rw-r--r--src/jit/lowerxarch.cpp9
-rw-r--r--src/jit/target.h2
3 files changed, 65 insertions, 50 deletions
diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp
index d442a6e858..c7fc1fda73 100755
--- a/src/jit/codegenxarch.cpp
+++ b/src/jit/codegenxarch.cpp
@@ -3046,9 +3046,22 @@ CodeGen::genMultiRegCallStoreToLocal(GenTreePtr treeNode)
}
-/***********************************************************************************************
- * Generate code for localloc
- */
+//------------------------------------------------------------------------
+// genLclHeap: Generate code for localloc.
+//
+// Arguments:
+// tree - the localloc tree to generate.
+//
+// Notes:
+// Note that for x86, we don't track ESP movements while generating the localloc code.
+// The ESP tracking is used to report stack pointer-relative GC info, which is not
+// interesting while doing the localloc construction. Also, for functions with localloc,
+// we have EBP frames, and EBP-relative locals, and ESP-relative accesses only for function
+// call arguments. We store the ESP after the localloc is complete in the LocAllocSP
+// variable. This variable is implicitly reported to the VM in the GC info (its position
+// is defined by convention relative to other items), and is used by the GC to find the
+// "base" stack pointer in functions with localloc.
+//
void
CodeGen::genLclHeap(GenTreePtr tree)
{
@@ -3106,7 +3119,9 @@ CodeGen::genLclHeap(GenTreePtr tree)
}
else
{
- // If 0 bail out by returning null in targetReg
+ // The localloc requested memory size is non-constant.
+
+ // Put the size value in targetReg. If it is zero, bail out by returning null in targetReg.
genConsumeRegAndCopy(size, targetReg);
endLabel = genCreateTempLabel();
getEmitter()->emitIns_R_R(INS_test, easz, targetReg, targetReg);
@@ -3127,33 +3142,40 @@ CodeGen::genLclHeap(GenTreePtr tree)
tmpRegsMask &= ~regCntMask;
regCnt = genRegNumFromMask(regCntMask);
if (regCnt != targetReg)
+ {
+ // Above, we put the size in targetReg. Now, copy it to our new temp register if necessary.
inst_RV_RV(INS_mov, regCnt, targetReg, size->TypeGet());
+ }
}
- // Align to STACK_ALIGN
- // regCnt will be the total number of bytes to localloc
- inst_RV_IV(INS_add, regCnt, (STACK_ALIGN - 1), emitActualTypeSize(type));
+ // Round up the number of bytes to allocate to a STACK_ALIGN boundary. This is done
+ // by code like:
+ // add reg, 15
+ // and reg, -16
+ // However, in the initialized memory case, we need the count of STACK_ALIGN-sized
+ // elements, not a byte count, after the alignment. So instead of the "and", which
+ // becomes unnecessary, generate a shift, e.g.:
+ // add reg, 15
+ // shr reg, 4
+
+ inst_RV_IV(INS_add, regCnt, STACK_ALIGN - 1, emitActualTypeSize(type));
-#if defined(_TARGET_X86_)
- // TODO-Cleanup: change amd64 to use the same code path as x86 to reduce #ifdefs
- // and improve amd64 CQ (use a dec loop instead of sub rsp loop).
if (compiler->info.compInitMem)
{
- // Convert the count from a count of bytes to a count of pointer-sized words.
- // We don't need the 'and' because we'll shift off those bits anyway. That's
- // asserted by the following.
- C_ASSERT((STACK_ALIGN >> STACK_ALIGN_SHIFT) <= 1);
+ // Convert the count from a count of bytes to a loop count. We will loop once per
+ // stack alignment size, so each loop will zero 4 bytes on x86 and 16 bytes on x64.
+ // Note that we zero a single reg-size word per iteration on x86, and 2 reg-size
+ // words per iteration on x64. We will shift off all the stack alignment bits
+ // added above, so there is no need for an 'and' instruction.
- // --- shr regCnt, 2 ---
- inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT);
+ // --- shr regCnt, 2 (or 4) ---
+ inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT_ALL);
}
else
{
+ // Otherwise, mask off the low bits to align the byte count.
inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
}
-#else // !_TARGET_X86_
- inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
-#endif // !_TARGET_X86_
}
#if FEATURE_FIXED_OUT_ARGS
@@ -3179,18 +3201,17 @@ CodeGen::genLclHeap(GenTreePtr tree)
{
// We should reach here only for non-zero, constant size allocations.
assert(amount > 0);
+ assert((amount % STACK_ALIGN) == 0);
+ assert((amount % REGSIZE_BYTES) == 0);
// For small allocations we will generate up to six push 0 inline
- size_t cntPtrSizedWords = (amount >> STACK_ALIGN_SHIFT);
- if (cntPtrSizedWords <= 6)
+ size_t cntRegSizedWords = amount / REGSIZE_BYTES;
+ if (cntRegSizedWords <= 6)
{
- while (cntPtrSizedWords != 0)
+ for (; cntRegSizedWords != 0; cntRegSizedWords--)
{
- // push_hide means don't track the stack
- inst_IV(INS_push_hide, 0);
- cntPtrSizedWords--;
+ inst_IV(INS_push_hide, 0); // push_hide means don't track the stack
}
-
goto ALLOC_DONE;
}
@@ -3246,14 +3267,12 @@ CodeGen::genLclHeap(GenTreePtr tree)
// else, "mov regCnt, amount"
-#if defined(_TARGET_X86_)
if (compiler->info.compInitMem)
{
- // For x86, when initializing memory with a constant count, we want 'amount' to be the
- // count of pointer-sized words, not bytes.
- amount = cntPtrSizedWords;
+ // When initializing memory, we want 'amount' to be the loop count.
+ assert((amount % STACK_ALIGN) == 0);
+ amount /= STACK_ALIGN;
}
-#endif // _TARGET_X86_
genSetRegToIcon(regCnt, amount, ((int)amount == amount)? TYP_INT : TYP_LONG);
}
@@ -3261,12 +3280,10 @@ CodeGen::genLclHeap(GenTreePtr tree)
loop = genCreateTempLabel();
if (compiler->info.compInitMem)
{
- // At this point 'regCnt' is set to the total number of bytes (or words, for constant counts) to locAlloc.
+ // At this point 'regCnt' is set to the number of loop iterations for this loop, if each
+ // iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes.
// Since we have to zero out the allocated memory AND ensure that RSP is always valid
// by tickling the pages, we will just push 0's on the stack.
- //
- // Note: regCnt is guaranteed to be even on Amd64 since STACK_ALIGN/TARGET_POINTER_SIZE = 2
- // and localloc size is a multiple of STACK_ALIGN.
assert(genIsValidIntReg(regCnt));
@@ -3274,23 +3291,18 @@ CodeGen::genLclHeap(GenTreePtr tree)
genDefineTempLabel(loop);
#if defined(_TARGET_AMD64_)
- // dec is a 2 byte instruction, but sub is 4 (could be 3 if
- // we know size is TYP_INT instead of TYP_I_IMPL)
- // Also we know that we can only push 8 bytes at a time, but
- // alignment is 16 bytes, so we can push twice and do a sub
- // for just a little bit of loop unrolling
+ // Push two 8-byte zeros. This matches the 16-byte STACK_ALIGN value.
+ static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
inst_IV(INS_push_hide, 0); // --- push 8-byte 0
inst_IV(INS_push_hide, 0); // --- push 8-byte 0
-
- // Note that regCnt is the number of bytes to stack allocate.
- // Therefore we need to subtract 16 from regcnt here.
- inst_RV_IV(INS_sub, regCnt, 16, emitActualTypeSize(type));
#elif defined(_TARGET_X86_)
+ // Push a single 4-byte zero. This matches the 4-byte STACK_ALIGN value.
+ static_assert_no_msg(STACK_ALIGN == REGSIZE_BYTES);
inst_IV(INS_push_hide, 0); // --- push 4-byte 0
- inst_RV(INS_dec, regCnt, TYP_I_IMPL);
#endif // _TARGET_X86_
- // If not done, loop
+ // Decrement the loop counter and loop if not done.
+ inst_RV(INS_dec, regCnt, TYP_I_IMPL);
inst_JMP(EJ_jne, loop);
}
else
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 585f97b3c6..d41239cc40 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -1901,8 +1901,8 @@ Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
//
// Size? Init Memory? # temp regs
// 0 - 0
- // const and <=6 ptr words - 0
- // const and >6 ptr words Yes 0
+ // const and <=6 reg words - 0
+ // const and >6 reg words Yes 0
// const and <PageSize No 0 (amd64) 1 (x86)
// const and >=PageSize No 2
// Non-const Yes 0
@@ -1925,11 +1925,12 @@ Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
// Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
// This should also help in debugging as we can examine the original size specified with localloc.
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
- size_t cntStackAlignedWidthItems = (sizeVal >> STACK_ALIGN_SHIFT);
// For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
// we will generate 'push 0'.
- if (cntStackAlignedWidthItems <= 6)
+ assert((sizeVal % REGSIZE_BYTES) == 0);
+ size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
+ if (cntRegSizedWords <= 6)
{
info->internalIntCount = 0;
}
diff --git a/src/jit/target.h b/src/jit/target.h
index aaddcba34d..dcb18dbe32 100644
--- a/src/jit/target.h
+++ b/src/jit/target.h
@@ -485,6 +485,7 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
#define CODE_ALIGN 1 // code alignment requirement
#define STACK_ALIGN 4 // stack alignment requirement
#define STACK_ALIGN_SHIFT 2 // Shift-right amount to convert stack size in bytes to size in DWORD_PTRs
+ #define STACK_ALIGN_SHIFT_ALL 2 // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units
#define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ESI|RBM_EDI)
#define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX)
@@ -780,6 +781,7 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
#define CODE_ALIGN 1 // code alignment requirement
#define STACK_ALIGN 16 // stack alignment requirement
#define STACK_ALIGN_SHIFT 3 // Shift-right amount to convert stack size in bytes to size in pointer sized words
+ #define STACK_ALIGN_SHIFT_ALL 4 // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units
#if ETW_EBP_FRAMED
#define RBM_ETW_FRAMED_EBP RBM_NONE