author | Bruce Forstall <brucefo@microsoft.com> | 2016-07-12 16:38:17 -0700
---|---|---
committer | Bruce Forstall <brucefo@microsoft.com> | 2016-07-13 13:26:35 -0700
commit | b68469c204678cc8f98609d0b5313b87e2b2f120 |
tree | 921ba34d90dd4ac6acf2ff7199634e66ffe2d1ae /src |
parent | e37cd0beda29f56f2826b5168ba6240c94145876 |
Implement localloc for RyuJIT/x86
Fixes #4182
Change RyuJIT to generate localloc code for x86 the same way the legacy JIT does:
use a push/dec loop for zero-initialized memory, and avoid "sub esp" so the
emitter does not have to track stack pointer adjustments.
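
To make the push/dec approach concrete, here is a minimal C++ sketch (not RyuJIT code; the names simulatedLocAllocZeroInit, simulatedEsp, and wordCount are invented for illustration) of what the emitted loop accomplishes: each iteration moves the stack pointer down one pointer-sized word and stores zero, which both zero-initializes the allocation and touches every page along the way.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical model of the x86 zero-init localloc loop
// ("loop: push 0; dec regCnt; jne loop"), using a byte pointer to stand in for ESP.
void* simulatedLocAllocZeroInit(uint8_t*& simulatedEsp, size_t wordCount)
{
    while (wordCount-- != 0)                              // "dec regCnt" / "jne loop"
    {
        simulatedEsp -= sizeof(uint32_t);                 // "push 0" moves ESP down one 4-byte word...
        std::memset(simulatedEsp, 0, sizeof(uint32_t));   // ...and writes a zero there
    }
    return simulatedEsp;                                  // start of the newly allocated, zeroed block
}
```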
Diffstat (limited to 'src')
-rwxr-xr-x | src/jit/codegenxarch.cpp | 100 |
-rw-r--r-- | src/jit/lowerxarch.cpp   |   6 |
2 files changed, 86 insertions, 20 deletions
diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp
index 37d5457ccb..d442a6e858 100755
--- a/src/jit/codegenxarch.cpp
+++ b/src/jit/codegenxarch.cpp
@@ -3052,7 +3052,6 @@ CodeGen::genMultiRegCallStoreToLocal(GenTreePtr treeNode)
 void
 CodeGen::genLclHeap(GenTreePtr tree)
 {
-    NYI_X86("Localloc");
     assert(tree->OperGet() == GT_LCLHEAP);
     assert(compiler->compLocallocUsed);
 
@@ -3134,7 +3133,27 @@ CodeGen::genLclHeap(GenTreePtr tree)
         // Align to STACK_ALIGN
         // regCnt will be the total number of bytes to localloc
         inst_RV_IV(INS_add, regCnt, (STACK_ALIGN - 1), emitActualTypeSize(type));
+
+#if defined(_TARGET_X86_)
+        // TODO-Cleanup: change amd64 to use the same code path as x86 to reduce #ifdefs
+        // and improve amd64 CQ (use a dec loop instead of sub rsp loop).
+        if (compiler->info.compInitMem)
+        {
+            // Convert the count from a count of bytes to a count of pointer-sized words.
+            // We don't need the 'and' because we'll shift off those bits anyway. That's
+            // asserted by the following.
+            C_ASSERT((STACK_ALIGN >> STACK_ALIGN_SHIFT) <= 1);
+
+            // --- shr regCnt, 2 ---
+            inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT);
+        }
+        else
+        {
+            inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
+        }
+#else // !_TARGET_X86_
         inst_RV_IV(INS_AND, regCnt, ~(STACK_ALIGN - 1), emitActualTypeSize(type));
+#endif // !_TARGET_X86_
     }
 
 #if FEATURE_FIXED_OUT_ARGS
@@ -3174,61 +3193,104 @@ CodeGen::genLclHeap(GenTreePtr tree)
             goto ALLOC_DONE;
         }
-        else if (!compiler->info.compInitMem && (amount < compiler->eeGetPageSize())) // must be < not <=
+
+        bool doNoInitLessThanOnePageAlloc = !compiler->info.compInitMem && (amount < compiler->eeGetPageSize()); // must be < not <=
+
+#ifdef _TARGET_X86_
+        bool needRegCntRegister = true;
+#else // !_TARGET_X86_
+        bool needRegCntRegister = !doNoInitLessThanOnePageAlloc;
+#endif // !_TARGET_X86_
+
+        if (needRegCntRegister)
+        {
+            // If compInitMem=true, we can reuse targetReg as regcnt.
+            // Since size is a constant, regCnt is not yet initialized.
+            assert(regCnt == REG_NA);
+            if (compiler->info.compInitMem)
+            {
+                assert(genCountBits(tmpRegsMask) == 0);
+                regCnt = targetReg;
+            }
+            else
+            {
+                assert(genCountBits(tmpRegsMask) >= 1);
+                regMaskTP regCntMask = genFindLowestBit(tmpRegsMask);
+                tmpRegsMask &= ~regCntMask;
+                regCnt = genRegNumFromMask(regCntMask);
+            }
+        }
+
+        if (doNoInitLessThanOnePageAlloc)
         {
             // Since the size is less than a page, simply adjust ESP.
             // ESP might already be in the guard page, so we must touch it BEFORE
             // the alloc, not after.
+
+#ifdef _TARGET_X86_
+            // For x86, we don't want to use "sub ESP" because we don't want the emitter to track the adjustment
+            // to ESP. So do the work in the count register.
+            // TODO-CQ: manipulate ESP directly, to share code, reduce #ifdefs, and improve CQ. This would require
+            // creating a way to temporarily turn off the emitter's tracking of ESP, maybe marking instrDescs as "don't track".
+            inst_RV_RV(INS_mov, regCnt, REG_SPBASE, TYP_I_IMPL);
+            getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
+            inst_RV_IV(INS_sub, regCnt, amount, EA_PTRSIZE);
+            inst_RV_RV(INS_mov, REG_SPBASE, regCnt, TYP_I_IMPL);
+#else // !_TARGET_X86_
             getEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0);
             inst_RV_IV(INS_sub, REG_SPBASE, amount, EA_PTRSIZE);
+#endif // !_TARGET_X86_
+
             goto ALLOC_DONE;
         }
 
         // else, "mov regCnt, amount"
-        // If compInitMem=true, we can reuse targetReg as regcnt.
-        // Since size is a constant, regCnt is not yet initialized.
-        assert(regCnt == REG_NA);
+
+#if defined(_TARGET_X86_)
         if (compiler->info.compInitMem)
-        {
-            assert(genCountBits(tmpRegsMask) == 0);
-            regCnt = targetReg;
-        }
-        else
         {
-            assert(genCountBits(tmpRegsMask) >= 1);
-            regMaskTP regCntMask = genFindLowestBit(tmpRegsMask);
-            tmpRegsMask &= ~regCntMask;
-            regCnt = genRegNumFromMask(regCntMask);
+            // For x86, when initializing memory with a constant count, we want 'amount' to be the
+            // count of pointer-sized words, not bytes.
+            amount = cntPtrSizedWords;
         }
+#endif // _TARGET_X86_
+
         genSetRegToIcon(regCnt, amount, ((int)amount == amount)? TYP_INT : TYP_LONG);
     }
 
     loop = genCreateTempLabel();
     if (compiler->info.compInitMem)
     {
-        // At this point 'regCnt' is set to the total number of bytes to locAlloc.
+        // At this point 'regCnt' is set to the total number of bytes (or words, for constant counts) to locAlloc.
         // Since we have to zero out the allocated memory AND ensure that RSP is always valid
         // by tickling the pages, we will just push 0's on the stack.
         //
         // Note: regCnt is guaranteed to be even on Amd64 since STACK_ALIGN/TARGET_POINTER_SIZE = 2
         // and localloc size is a multiple of STACK_ALIGN.
+        assert(genIsValidIntReg(regCnt));
+
         // Loop:
         genDefineTempLabel(loop);
 
+#if defined(_TARGET_AMD64_)
         // dec is a 2 byte instruction, but sub is 4 (could be 3 if
         // we know size is TYP_INT instead of TYP_I_IMPL)
         // Also we know that we can only push 8 bytes at a time, but
         // alignment is 16 bytes, so we can push twice and do a sub
         // for just a little bit of loop unrolling
-        inst_IV(INS_push_hide, 0);   // --- push 0
-        inst_IV(INS_push_hide, 0);   // --- push 0
+        inst_IV(INS_push_hide, 0);   // --- push 8-byte 0
+        inst_IV(INS_push_hide, 0);   // --- push 8-byte 0
 
-        // If not done, loop
         // Note that regCnt is the number of bytes to stack allocate.
         // Therefore we need to subtract 16 from regcnt here.
-        assert(genIsValidIntReg(regCnt));
         inst_RV_IV(INS_sub, regCnt, 16, emitActualTypeSize(type));
+#elif defined(_TARGET_X86_)
+        inst_IV(INS_push_hide, 0);   // --- push 4-byte 0
+        inst_RV(INS_dec, regCnt, TYP_I_IMPL);
+#endif // _TARGET_X86_
+
+        // If not done, loop
         inst_JMP(EJ_jne, loop);
     }
     else
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index d05dc5b597..585f97b3c6 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -1903,7 +1903,7 @@ Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
     //     0                        -      0
     //     const and <=6 ptr words  -      0
     //     const and >6 ptr words   Yes    0
-    //     const and <PageSize      No     0
+    //     const and <PageSize      No     0 (amd64) 1 (x86)
     //     const and >=PageSize     No     2
     //     Non-const                Yes    0
     //     Non-const                No     2
@@ -1938,7 +1938,11 @@ Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
                 // No need to initialize allocated stack space.
                 if (sizeVal < compiler->eeGetPageSize())
                 {
+#ifdef _TARGET_X86_
+                    info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP.
+#else // !_TARGET_X86_
                     info->internalIntCount = 0;
+#endif // !_TARGET_X86_
                 }
                 else
                 {
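
For the small, non-initialized case, the lowering change above reserves one internal register on x86 so codegen can do the adjustment in a scratch register instead of emitting "sub esp", which the emitter would otherwise track. A rough C++ model of that probe-then-adjust sequence follows; the names (simulatedSmallLocAlloc, simulatedEsp) are invented, and this is an illustration of the idea, not the actual JIT code:

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical model of the x86 path: touch the current stack page BEFORE moving the
// pointer, compute the new top in a scratch value (the "count register"), then publish it.
uint8_t* simulatedSmallLocAlloc(uint8_t*& simulatedEsp, size_t amount /* assumed < page size */)
{
    volatile uint8_t probe = *simulatedEsp;     // models "test [esp], esp": probe the current page first
    (void)probe;
    uint8_t* newTop = simulatedEsp - amount;    // models "mov regCnt, esp" + "sub regCnt, amount"
    simulatedEsp = newTop;                      // models "mov esp, regCnt"
    return newTop;                              // freshly reserved (uninitialized) block
}
```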